1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
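// Editor's note (illustrative, not part of the original source): given the
// parsing in TailFoldingOption::operator= above, an option string such as
// "-sve-tail-folding=all+noreverse" selects TailFoldingOpts::All as the
// initial bits and then disables TailFoldingOpts::Reverse, while
// "-sve-tail-folding=default+reductions" keeps the target CPU's default bits
// and additionally enables TailFoldingOpts::Reductions.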
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
248 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
255static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
283bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
291 CallAttrs.callee().hasStreamingInterface())
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
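// Illustrative example (annotation, not in the original source): because
// FeatureExecuteOnly is inverted above, a callee built with "+execute-only"
// can be inlined into a caller compiled without it, but a callee lacking
// "+execute-only" fails the subset check against an execute-only caller and
// is therefore not inlined.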
325
326bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
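// Illustrative example (annotation): with SVE fixed-length lowering enabled,
// a candidate argument of type ptr to <8 x float> (256 bits) makes this hook
// return false, so argument promotion will not rewrite it into a by-value
// vector, whereas a ptr to <4 x float> (128 bits) remains ABI compatible.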
349
350unsigned
351AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
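// Worked example (annotation): with the default option values above
// (CallPenaltyChangeSM = 5, InlineCallPenaltyChangeSM = 10), case (1) yields
// 5 * DefaultCallPenalty and case (2) yields 10 * DefaultCallPenalty;
// otherwise DefaultCallPenalty is returned unchanged.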
385
386bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
387 TargetTransformInfo::RegisterKind K) const {
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
393 return K == TargetTransformInfo::RGK_ScalableVector &&
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
401InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
410 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
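// Illustrative examples (annotation, assuming the usual MOVZ/MOVK expansion):
// Val == 0 and any encodable 64-bit logical immediate are costed as 0 because
// they can be encoded directly, while a value such as 0x12345678 typically
// expands to MOVZ + MOVK and is therefore costed as 2.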
414
415/// Calculate the cost of materializing the given constant.
416InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
417 TTI::TargetCostKind CostKind) const {
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
432 InstructionCost Cost = 0;
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
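// Worked example (annotation): an i128 constant is sign-extended to a
// multiple of 64 bits and costed chunk by chunk; if each 64-bit chunk needs
// two move instructions to materialize, the total cost returned is 4.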
441
442InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
443 const APInt &Imm, Type *Ty,
444 TTI::TargetCostKind CostKind,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
502 InstructionCost Cost = getIntImmCost(Imm, Ty, CostKind);
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
507 return TTI::TCC_Free;
508}
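// Illustrative example (annotation): for an i64 add whose immediate needs two
// move instructions, the cost above exceeds NumConstants * TCC_Basic, so the
// immediate is reported at its real cost and constant hoisting may pull it
// out; an immediate that fits a single instruction is reported as TCC_Free
// and left in place.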
509
510InstructionCost
511AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
512 const APInt &Imm, Type *Ty,
513 TTI::TargetCostKind CostKind) const {
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
526 return getIntImmCost(Imm, Ty, CostKind);
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
539 InstructionCost Cost = getIntImmCost(Imm, Ty, CostKind);
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
559 return TTI::TCC_Free;
560}
561
562TTI::PopcntSupportKind
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
566 return TTI::PSK_FastHardware;
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
573 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
574}
575
576static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
581 return InstructionCost::getInvalid();
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
585 return InstructionCost::getInvalid();
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
594 return InstructionCost::getInvalid();
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
601 return InstructionCost::getInvalid();
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
615 return InstructionCost::getInvalid();
616}
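// Worked example (annotation): for a <vscale x 8 x ptr> vector of bucket
// pointers with i32 bucket elements, the legal element size is 32, so
// NaturalVectorWidth = 128 / 32 = 4 and TotalHistCnts = 8 / 4 = 2, giving a
// cost of BaseHistCntCost * 2 (16 with the default option value).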
617
618InstructionCost
619AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
620 TTI::TargetCostKind CostKind) const {
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
628 return InstructionCost::getInvalid();
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
658 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
659 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
660 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
661 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
662 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
663 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
664 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
665 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
666 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(RetTy);
669 const auto *Entry =
670 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(ValidSatTys, equal_to(LT.second)))
688 return LT.first * Instrs;
689
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
702 MVT::nxv4i32, MVT::nxv2i64};
703 auto LT = getTypeLegalizationCost(RetTy);
704 if (any_of(ValidAbsTys, equal_to(LT.second)))
705 return LT.first;
706 break;
707 }
708 case Intrinsic::bswap: {
709 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
710 MVT::v4i32, MVT::v2i64};
711 auto LT = getTypeLegalizationCost(RetTy);
712 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
713 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
714 return LT.first;
715 break;
716 }
717 case Intrinsic::fma:
718 case Intrinsic::fmuladd: {
719 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
720 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
721 Type *EltTy = RetTy->getScalarType();
722 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
723 (EltTy->isHalfTy() && ST->hasFullFP16()))
724 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
725 break;
726 }
727 case Intrinsic::stepvector: {
728 InstructionCost Cost = 1; // Cost of the `index' instruction
729 auto LT = getTypeLegalizationCost(RetTy);
730 // Legalisation of illegal vectors involves an `index' instruction plus
731 // (LT.first - 1) vector adds.
732 if (LT.first > 1) {
733 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
734 InstructionCost AddCost =
735 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
736 Cost += AddCost * (LT.first - 1);
737 }
738 return Cost;
739 }
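  // Worked example (annotation, assuming a vector add cost of 1): a
  // stepvector producing <vscale x 8 x i64> legalizes to four nxv2i64 parts,
  // so the cost is 1 (index) + 3 (adds) = 4.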
740 case Intrinsic::vector_extract:
741 case Intrinsic::vector_insert: {
742 // If both the vector and subvector types are legal types and the index
743 // is 0, then this should be a no-op or simple operation; return a
744 // relatively low cost.
745
746 // If arguments aren't actually supplied, then we cannot determine the
747 // value of the index. We also want to skip predicate types.
748 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
750 break;
751
752 LLVMContext &C = RetTy->getContext();
753 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
754 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
755 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
756 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
757 // Skip this if either the vector or subvector types are unpacked
758 // SVE types; they may get lowered to stack stores and loads.
759 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
760 break;
761
763 getTLI()->getTypeConversion(C, SubVecVT);
765 getTLI()->getTypeConversion(C, VecVT);
766 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
767 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
768 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
769 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
770 return TTI::TCC_Free;
771 break;
772 }
773 case Intrinsic::bitreverse: {
774 static const CostTblEntry BitreverseTbl[] = {
775 {Intrinsic::bitreverse, MVT::i32, 1},
776 {Intrinsic::bitreverse, MVT::i64, 1},
777 {Intrinsic::bitreverse, MVT::v8i8, 1},
778 {Intrinsic::bitreverse, MVT::v16i8, 1},
779 {Intrinsic::bitreverse, MVT::v4i16, 2},
780 {Intrinsic::bitreverse, MVT::v8i16, 2},
781 {Intrinsic::bitreverse, MVT::v2i32, 2},
782 {Intrinsic::bitreverse, MVT::v4i32, 2},
783 {Intrinsic::bitreverse, MVT::v1i64, 2},
784 {Intrinsic::bitreverse, MVT::v2i64, 2},
785 };
786 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
787 const auto *Entry =
788 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
789 if (Entry) {
790 // The cost model uses the legal type (i32) that i8 and i16 are promoted
791 // to, plus 1 so that we match the actual lowering cost.
792 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
793 TLI->getValueType(DL, RetTy, true) == MVT::i16)
794 return LegalisationCost.first * Entry->Cost + 1;
795
796 return LegalisationCost.first * Entry->Cost;
797 }
798 break;
799 }
800 case Intrinsic::ctpop: {
801 if (!ST->hasNEON()) {
802 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
803 return getTypeLegalizationCost(RetTy).first * 12;
804 }
805 static const CostTblEntry CtpopCostTbl[] = {
806 {ISD::CTPOP, MVT::v2i64, 4},
807 {ISD::CTPOP, MVT::v4i32, 3},
808 {ISD::CTPOP, MVT::v8i16, 2},
809 {ISD::CTPOP, MVT::v16i8, 1},
810 {ISD::CTPOP, MVT::i64, 4},
811 {ISD::CTPOP, MVT::v2i32, 3},
812 {ISD::CTPOP, MVT::v4i16, 2},
813 {ISD::CTPOP, MVT::v8i8, 1},
814 {ISD::CTPOP, MVT::i32, 5},
815 };
816 auto LT = getTypeLegalizationCost(RetTy);
817 MVT MTy = LT.second;
818 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
819 // Extra cost of +1 when illegal vector types are legalized by promoting
820 // the integer type.
821 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
822 RetTy->getScalarSizeInBits()
823 ? 1
824 : 0;
825 return LT.first * Entry->Cost + ExtraCost;
826 }
827 break;
828 }
829 case Intrinsic::sadd_with_overflow:
830 case Intrinsic::uadd_with_overflow:
831 case Intrinsic::ssub_with_overflow:
832 case Intrinsic::usub_with_overflow:
833 case Intrinsic::smul_with_overflow:
834 case Intrinsic::umul_with_overflow: {
835 static const CostTblEntry WithOverflowCostTbl[] = {
836 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
837 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
838 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
839 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
840 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
841 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
842 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
843 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
844 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
845 {Intrinsic::usub_with_overflow, MVT::i8, 3},
846 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
847 {Intrinsic::usub_with_overflow, MVT::i16, 3},
848 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
849 {Intrinsic::usub_with_overflow, MVT::i32, 1},
850 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
851 {Intrinsic::usub_with_overflow, MVT::i64, 1},
852 {Intrinsic::smul_with_overflow, MVT::i8, 5},
853 {Intrinsic::umul_with_overflow, MVT::i8, 4},
854 {Intrinsic::smul_with_overflow, MVT::i16, 5},
855 {Intrinsic::umul_with_overflow, MVT::i16, 4},
856 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
857 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
858 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
859 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
860 };
861 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
862 if (MTy.isSimple())
863 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
864 MTy.getSimpleVT()))
865 return Entry->Cost;
866 break;
867 }
868 case Intrinsic::fptosi_sat:
869 case Intrinsic::fptoui_sat: {
870 if (ICA.getArgTypes().empty())
871 break;
872 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
873 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
874 EVT MTy = TLI->getValueType(DL, RetTy);
875 // Check for the legal types, which are where the size of the input and the
876 // output are the same, or we are using cvt f64->i32 or f32->i64.
877 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
878 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
879 LT.second == MVT::v2f64)) {
880 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
881 (LT.second == MVT::f64 && MTy == MVT::i32) ||
882 (LT.second == MVT::f32 && MTy == MVT::i64)))
883 return LT.first;
884 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
885 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
886 MTy.getScalarSizeInBits() == 64)
887 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
888 }
889 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
890 // f32.
891 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
892 return LT.first + getIntrinsicInstrCost(
893 {ICA.getID(),
894 RetTy,
895 {ICA.getArgTypes()[0]->getWithNewType(
896 Type::getFloatTy(RetTy->getContext()))}},
897 CostKind);
898 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
899 (LT.second == MVT::f16 && MTy == MVT::i64) ||
900 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
901 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
902 return LT.first;
903 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
904 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
905 MTy.getScalarSizeInBits() == 32)
906 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
907 // Extending f16 vector types to 64-bit integer elements (e.g. v4f16->v4i64)
908 // currently scalarizes, but the codegen could be better.
909 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
910 MTy.getScalarSizeInBits() == 64)
911 return MTy.getVectorNumElements() * 3;
912
913 // If we can we use a legal convert followed by a min+max
914 if ((LT.second.getScalarType() == MVT::f32 ||
915 LT.second.getScalarType() == MVT::f64 ||
916 LT.second.getScalarType() == MVT::f16) &&
917 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
918 Type *LegalTy =
919 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
920 if (LT.second.isVector())
921 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
923 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
924 : Intrinsic::umin,
925 LegalTy, {LegalTy, LegalTy});
927 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
928 : Intrinsic::umax,
929 LegalTy, {LegalTy, LegalTy});
931 return LT.first * Cost +
932 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
933 : 1);
934 }
935 // Otherwise we need to follow the default expansion that clamps the value
936 // using a float min/max with a fcmp+sel for nan handling when signed.
937 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
938 RetTy = RetTy->getScalarType();
939 if (LT.second.isVector()) {
940 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
941 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
942 }
943 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
945 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
947 Cost +=
948 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
950 if (IsSigned) {
951 Type *CondTy = RetTy->getWithNewBitWidth(1);
952 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
954 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
956 }
957 return LT.first * Cost;
958 }
959 case Intrinsic::fshl:
960 case Intrinsic::fshr: {
961 if (ICA.getArgs().empty())
962 break;
963
964 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
965
966 // ROTR / ROTL is a funnel shift with equal first and second operand. For
967 // ROTR on integer registers (i32/i64) this can be done in a single ror
968 // instruction. A fshl with a non-constant shift uses a neg + ror.
969 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
970 (RetTy->getPrimitiveSizeInBits() == 32 ||
971 RetTy->getPrimitiveSizeInBits() == 64)) {
972 InstructionCost NegCost =
973 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
974 return 1 + NegCost;
975 }
976
977 // TODO: Add handling for fshl where third argument is not a constant.
978 if (!OpInfoZ.isConstant())
979 break;
980
981 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
982 if (OpInfoZ.isUniform()) {
983 static const CostTblEntry FshlTbl[] = {
984 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
985 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
986 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
987 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
988 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
989 // to avoid having to duplicate the costs.
990 const auto *Entry =
991 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
992 if (Entry)
993 return LegalisationCost.first * Entry->Cost;
994 }
995
996 auto TyL = getTypeLegalizationCost(RetTy);
997 if (!RetTy->isIntegerTy())
998 break;
999
1000 // Estimate cost manually, as types like i8 and i16 will get promoted to
1001 // i32 and CostTableLookup will ignore the extra conversion cost.
1002 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1003 RetTy->getScalarSizeInBits() < 64) ||
1004 (RetTy->getScalarSizeInBits() % 64 != 0);
1005 unsigned ExtraCost = HigherCost ? 1 : 0;
1006 if (RetTy->getScalarSizeInBits() == 32 ||
1007 RetTy->getScalarSizeInBits() == 64)
1008 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
1009 // extr instruction.
1010 else if (HigherCost)
1011 ExtraCost = 1;
1012 else
1013 break;
1014 return TyL.first + ExtraCost;
1015 }
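  // Illustrative examples (annotation): a rotate (fshl with equal operands)
  // on i32/i64 costs 1 (a single ror), or 2 for fshl when the rotate amount
  // is non-constant (neg + ror); a uniform constant-shift v4i32 funnel shift
  // maps to shl + usra and costs 2 per legalized vector.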
1016 case Intrinsic::get_active_lane_mask: {
1017 auto RetTy = cast<VectorType>(ICA.getReturnType());
1018 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1019 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1020 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1021 break;
1022
1023 if (RetTy->isScalableTy()) {
1024 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1026 break;
1027
1028 auto LT = getTypeLegalizationCost(RetTy);
1029 InstructionCost Cost = LT.first;
1030 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1031 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1032 // nxv32i1 = get_active_lane_mask(base, idx) ->
1033 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1034 if (ST->hasSVE2p1() || ST->hasSME2()) {
1035 Cost /= 2;
1036 if (Cost == 1)
1037 return Cost;
1038 }
1039
1040 // If more than one whilelo intrinsic is required, include the extra cost
1041 // required by the saturating add & select required to increment the
1042 // start value after the first intrinsic call.
1043 Type *OpTy = ICA.getArgTypes()[0];
1044 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1045 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1046 Type *CondTy = OpTy->getWithNewBitWidth(1);
1047 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1049 return Cost + (SplitCost * (Cost - 1));
1050 } else if (!getTLI()->isTypeLegal(RetVT)) {
1051 // We don't have enough context at this point to determine if the mask
1052 // is going to be kept live after the block, which will force the vXi1
1053 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1054 // For now, we just assume the vectorizer created this intrinsic and
1055 // the result will be the input for a PHI. In this case the cost will
1056 // be extremely high for fixed-width vectors.
1057 // NOTE: getScalarizationOverhead returns a cost that's far too
1058 // pessimistic for the actual generated codegen. In reality there are
1059 // two instructions generated per lane.
1060 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1061 }
1062 break;
1063 }
1064 case Intrinsic::experimental_vector_match: {
1065 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1066 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1067 unsigned SearchSize = NeedleTy->getNumElements();
1068 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1069 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1070 // Neoverse V3, these are cheap operations with the same latency as a
1071 // vector ADD. In most cases, however, we also need to do an extra DUP.
1072 // For fixed-length vectors we currently need an extra five to six
1073 // instructions besides the MATCH.
1075 if (isa<FixedVectorType>(RetTy))
1076 Cost += 10;
1077 return Cost;
1078 }
1079 break;
1080 }
1081 case Intrinsic::cttz: {
1082 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1083 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1084 return LT.first * 2;
1085 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1086 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1087 return LT.first * 3;
1088 break;
1089 }
1090 case Intrinsic::experimental_cttz_elts: {
1091 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1092 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1093 // This will consist of a SVE brkb and a cntp instruction. These
1094 // typically have the same latency and half the throughput as a vector
1095 // add instruction.
1096 return 4;
1097 }
1098 break;
1099 }
1100 case Intrinsic::loop_dependence_raw_mask:
1101 case Intrinsic::loop_dependence_war_mask: {
1102 // The whilewr/rw instructions require SVE2 or SME.
1103 if (ST->hasSVE2() || ST->hasSME()) {
1104 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1105 unsigned EltSizeInBytes =
1106 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1107 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1108 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1109 break;
1110 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1111 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1112 }
1113 break;
1114 }
1115 case Intrinsic::experimental_vector_extract_last_active:
1116 if (ST->isSVEorStreamingSVEAvailable()) {
1117 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1118 // This should turn into chained clastb instructions.
1119 return LegalCost;
1120 }
1121 break;
1122 case Intrinsic::pow: {
1123 // For scalar calls we know the target has the libcall, and for fixed-width
1124 // vectors we know for the worst case it can be scalarised.
1125 EVT VT = getTLI()->getValueType(DL, RetTy);
1126 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1127 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1128 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;
1129
1130 // If we know that the call can be lowered with libcalls then it's safe to
1131 // reduce the costs in some cases. This is important for scalable vectors,
1132 // since we cannot scalarize the call in the absence of a vector math
1133 // library.
1134 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1135 // If we know the fast math flags and the exponent is a constant then the
1136 // cost may be less for some exponents like 0.25 and 0.75.
1137 const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
1138 if (ExpC && isa<VectorType>(ExpC->getType()))
1139 ExpC = ExpC->getSplatValue();
1140 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
1141 // The argument must be a FP constant.
1142 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1143 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1144 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1145 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1146 (!Is025 || FMF.noSignedZeros())) {
1147 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1148 InstructionCost Sqrt = getIntrinsicInstrCost(Attrs, CostKind);
1149 if (Is025)
1150 return 2 * Sqrt;
1151 InstructionCost FMul =
1152 getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
1153 return (Sqrt * 2) + FMul;
1154 }
1155 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1156 // cheaper than pow.
1157 }
1158 }
1159
1160 if (HasLibcall)
1161 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1162 break;
1163 }
1164 case Intrinsic::sqrt:
1165 case Intrinsic::fabs:
1166 case Intrinsic::ceil:
1167 case Intrinsic::floor:
1168 case Intrinsic::nearbyint:
1169 case Intrinsic::round:
1170 case Intrinsic::rint:
1171 case Intrinsic::roundeven:
1172 case Intrinsic::trunc:
1173 case Intrinsic::minnum:
1174 case Intrinsic::maxnum:
1175 case Intrinsic::minimum:
1176 case Intrinsic::maximum: {
1177 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1178 auto LT = getTypeLegalizationCost(RetTy);
1179 return LT.first;
1180 }
1181 break;
1182 }
1183 default:
1184 break;
1185 }
1186 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1187}
1188
1189/// The function removes redundant reinterpret casts of predicates when they
1190/// occur in the presence of control flow (i.e. across PHI nodes).
1191static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1192 IntrinsicInst &II) {
1193 SmallVector<Instruction *, 32> Worklist;
1194 auto RequiredType = II.getType();
1195
1196 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1197 assert(PN && "Expected Phi Node!");
1198
1199 // Don't create a new Phi unless we can remove the old one.
1200 if (!PN->hasOneUse())
1201 return std::nullopt;
1202
1203 for (Value *IncValPhi : PN->incoming_values()) {
1204 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1205 if (!Reinterpret ||
1206 Reinterpret->getIntrinsicID() !=
1207 Intrinsic::aarch64_sve_convert_to_svbool ||
1208 RequiredType != Reinterpret->getArgOperand(0)->getType())
1209 return std::nullopt;
1210 }
1211
1212 // Create the new Phi
1213 IC.Builder.SetInsertPoint(PN);
1214 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1215 Worklist.push_back(PN);
1216
1217 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1218 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1219 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1220 Worklist.push_back(Reinterpret);
1221 }
1222
1223 // Cleanup Phi Node and reinterprets
1224 return IC.replaceInstUsesWith(II, NPN);
1225}
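// Illustrative example (annotation): if every incoming value of the PHI is an
// aarch64.sve.convert.to.svbool whose source already has the required type
// (e.g. <vscale x 4 x i1>), the PHI is rebuilt over those source values, the
// now-dead conversions are collected in the worklist, and the original
// intrinsic's uses are replaced with the new PHI.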
1226
1227// A collection of properties common to SVE intrinsics that allow for combines
1228// to be written without needing to know the specific intrinsic.
1229struct SVEIntrinsicInfo {
1230 //
1231 // Helper routines for common intrinsic definitions.
1232 //
1233
1234 // e.g. llvm.aarch64.sve.add pg, op1, op2
1235 // with IID ==> llvm.aarch64.sve.add_u
1236 static SVEIntrinsicInfo
1243
1244 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1251
1252 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1258
1259 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1265
1266 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1267 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1268 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1269 return SVEIntrinsicInfo()
1272 }
1273
1274 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1275 // llvm.aarch64.sve.ld1 pg, ptr
1282
1283 // All properties relate to predication and thus having a general predicate
1284 // is the minimum requirement to say there is intrinsic info to act on.
1285 explicit operator bool() const { return hasGoverningPredicate(); }
1286
1287 //
1288 // Properties relating to the governing predicate.
1289 //
1290
1292 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1293 }
1294
1296 assert(hasGoverningPredicate() && "Property not set!");
1297 return GoverningPredicateIdx;
1298 }
1299
1301 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1302 GoverningPredicateIdx = Index;
1303 return *this;
1304 }
1305
1306 //
1307 // Properties relating to operations the intrinsic could be transformed into.
1308 // NOTE: This does not mean such a transformation is always possible, but the
1309 // knowledge makes it possible to reuse existing optimisations without needing
1310 // to embed specific handling for each intrinsic. For example, instruction
1311 // simplification can be used to optimise an intrinsic's active lanes.
1312 //
1313
1315 return UndefIntrinsic != Intrinsic::not_intrinsic;
1316 }
1317
1319 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1320 return UndefIntrinsic;
1321 }
1322
1324 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1325 UndefIntrinsic = IID;
1326 return *this;
1327 }
1328
1329 bool hasMatchingIROpode() const { return IROpcode != 0; }
1330
1331 unsigned getMatchingIROpode() const {
1332 assert(hasMatchingIROpode() && "Property not set!");
1333 return IROpcode;
1334 }
1335
1337 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1338 IROpcode = Opcode;
1339 return *this;
1340 }
1341
1342 //
1343 // Properties relating to the result of inactive lanes.
1344 //
1345
1347 return ResultLanes == InactiveLanesTakenFromOperand;
1348 }
1349
1351 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1352 return OperandIdxForInactiveLanes;
1353 }
1354
1356 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1357 ResultLanes = InactiveLanesTakenFromOperand;
1358 OperandIdxForInactiveLanes = Index;
1359 return *this;
1360 }
1361
1363 return ResultLanes == InactiveLanesAreNotDefined;
1364 }
1365
1367 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1368 ResultLanes = InactiveLanesAreNotDefined;
1369 return *this;
1370 }
1371
1373 return ResultLanes == InactiveLanesAreUnused;
1374 }
1375
1377 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1378 ResultLanes = InactiveLanesAreUnused;
1379 return *this;
1380 }
1381
1382 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1383 // inactiveLanesAreZeroed =
1384 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1385 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1386
1388 ResultIsZeroInitialized = true;
1389 return *this;
1390 }
1391
1392 //
1393 // The first operand of unary merging operations is typically only used to
1394 // set the result for inactive lanes. Knowing this allows us to deadcode the
1395 // operand when we can prove there are no inactive lanes.
1396 //
1397
1399 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1400 }
1401
1403 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1404 return OperandIdxWithNoActiveLanes;
1405 }
1406
1408 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1409 OperandIdxWithNoActiveLanes = Index;
1410 return *this;
1411 }
1412
1413private:
1414 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1415
1416 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1417 unsigned IROpcode = 0;
1418
1419 enum PredicationStyle {
1421 InactiveLanesTakenFromOperand,
1422 InactiveLanesAreNotDefined,
1423 InactiveLanesAreUnused
1424 } ResultLanes = Uninitialized;
1425
1426 bool ResultIsZeroInitialized = false;
1427 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1428 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1429};
1430
1432 // Some SVE intrinsics do not use scalable vector types, but since they are
1433 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1434 if (!isa<ScalableVectorType>(II.getType()) &&
1435 all_of(II.args(), [&](const Value *V) {
1436 return !isa<ScalableVectorType>(V->getType());
1437 }))
1438 return SVEIntrinsicInfo();
1439
1440 Intrinsic::ID IID = II.getIntrinsicID();
1441 switch (IID) {
1442 default:
1443 break;
1444 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1445 case Intrinsic::aarch64_sve_fcvt_f16f32:
1446 case Intrinsic::aarch64_sve_fcvt_f16f64:
1447 case Intrinsic::aarch64_sve_fcvt_f32f16:
1448 case Intrinsic::aarch64_sve_fcvt_f32f64:
1449 case Intrinsic::aarch64_sve_fcvt_f64f16:
1450 case Intrinsic::aarch64_sve_fcvt_f64f32:
1451 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1452 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1453 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1454 case Intrinsic::aarch64_sve_fcvtzs:
1455 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1456 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1457 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1458 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1459 case Intrinsic::aarch64_sve_fcvtzu:
1460 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1461 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1462 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1463 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1464 case Intrinsic::aarch64_sve_revb:
1465 case Intrinsic::aarch64_sve_revh:
1466 case Intrinsic::aarch64_sve_revw:
1467 case Intrinsic::aarch64_sve_revd:
1468 case Intrinsic::aarch64_sve_scvtf:
1469 case Intrinsic::aarch64_sve_scvtf_f16i32:
1470 case Intrinsic::aarch64_sve_scvtf_f16i64:
1471 case Intrinsic::aarch64_sve_scvtf_f32i64:
1472 case Intrinsic::aarch64_sve_scvtf_f64i32:
1473 case Intrinsic::aarch64_sve_ucvtf:
1474 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1475 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1476 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1477 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1479
1480 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1481 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1482 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1483 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1485
1486 case Intrinsic::aarch64_sve_fabd:
1487 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1488 case Intrinsic::aarch64_sve_fadd:
1489 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1490 .setMatchingIROpcode(Instruction::FAdd);
1491 case Intrinsic::aarch64_sve_fdiv:
1492 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1493 .setMatchingIROpcode(Instruction::FDiv);
1494 case Intrinsic::aarch64_sve_fmax:
1495 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1496 case Intrinsic::aarch64_sve_fmaxnm:
1497 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1498 case Intrinsic::aarch64_sve_fmin:
1499 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1500 case Intrinsic::aarch64_sve_fminnm:
1501 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1502 case Intrinsic::aarch64_sve_fmla:
1503 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1504 case Intrinsic::aarch64_sve_fmls:
1505 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1506 case Intrinsic::aarch64_sve_fmul:
1507 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1508 .setMatchingIROpcode(Instruction::FMul);
1509 case Intrinsic::aarch64_sve_fmulx:
1510 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1511 case Intrinsic::aarch64_sve_fnmla:
1512 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1513 case Intrinsic::aarch64_sve_fnmls:
1514 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1515 case Intrinsic::aarch64_sve_fsub:
1516 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1517 .setMatchingIROpcode(Instruction::FSub);
1518 case Intrinsic::aarch64_sve_add:
1519 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1520 .setMatchingIROpcode(Instruction::Add);
1521 case Intrinsic::aarch64_sve_mla:
1522 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1523 case Intrinsic::aarch64_sve_mls:
1524 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1525 case Intrinsic::aarch64_sve_mul:
1526 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1527 .setMatchingIROpcode(Instruction::Mul);
1528 case Intrinsic::aarch64_sve_sabd:
1529 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1530 case Intrinsic::aarch64_sve_sdiv:
1531 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1532 .setMatchingIROpcode(Instruction::SDiv);
1533 case Intrinsic::aarch64_sve_smax:
1534 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1535 case Intrinsic::aarch64_sve_smin:
1536 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1537 case Intrinsic::aarch64_sve_smulh:
1538 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1539 case Intrinsic::aarch64_sve_sub:
1540 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1541 .setMatchingIROpcode(Instruction::Sub);
1542 case Intrinsic::aarch64_sve_uabd:
1543 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1544 case Intrinsic::aarch64_sve_udiv:
1545 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1546 .setMatchingIROpcode(Instruction::UDiv);
1547 case Intrinsic::aarch64_sve_umax:
1548 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1549 case Intrinsic::aarch64_sve_umin:
1550 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1551 case Intrinsic::aarch64_sve_umulh:
1552 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1553 case Intrinsic::aarch64_sve_asr:
1554 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1555 .setMatchingIROpcode(Instruction::AShr);
1556 case Intrinsic::aarch64_sve_lsl:
1557 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1558 .setMatchingIROpcode(Instruction::Shl);
1559 case Intrinsic::aarch64_sve_lsr:
1560 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1561 .setMatchingIROpcode(Instruction::LShr);
1562 case Intrinsic::aarch64_sve_and:
1563 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1564 .setMatchingIROpcode(Instruction::And);
1565 case Intrinsic::aarch64_sve_bic:
1566 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1567 case Intrinsic::aarch64_sve_eor:
1568 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1569 .setMatchingIROpcode(Instruction::Xor);
1570 case Intrinsic::aarch64_sve_orr:
1571 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1572 .setMatchingIROpcode(Instruction::Or);
1573 case Intrinsic::aarch64_sve_shsub:
1574 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1575 case Intrinsic::aarch64_sve_shsubr:
1577 case Intrinsic::aarch64_sve_sqrshl:
1578 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1579 case Intrinsic::aarch64_sve_sqshl:
1580 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1581 case Intrinsic::aarch64_sve_sqsub:
1582 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1583 case Intrinsic::aarch64_sve_srshl:
1584 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1585 case Intrinsic::aarch64_sve_uhsub:
1586 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1587 case Intrinsic::aarch64_sve_uhsubr:
1589 case Intrinsic::aarch64_sve_uqrshl:
1590 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1591 case Intrinsic::aarch64_sve_uqshl:
1592 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1593 case Intrinsic::aarch64_sve_uqsub:
1594 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1595 case Intrinsic::aarch64_sve_urshl:
1596 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1597
1598 case Intrinsic::aarch64_sve_add_u:
1600 Instruction::Add);
1601 case Intrinsic::aarch64_sve_and_u:
1603 Instruction::And);
1604 case Intrinsic::aarch64_sve_asr_u:
1606 Instruction::AShr);
1607 case Intrinsic::aarch64_sve_eor_u:
1609 Instruction::Xor);
1610 case Intrinsic::aarch64_sve_fadd_u:
1612 Instruction::FAdd);
1613 case Intrinsic::aarch64_sve_fdiv_u:
1615 Instruction::FDiv);
1616 case Intrinsic::aarch64_sve_fmul_u:
1618 Instruction::FMul);
1619 case Intrinsic::aarch64_sve_fsub_u:
1621 Instruction::FSub);
1622 case Intrinsic::aarch64_sve_lsl_u:
1624 Instruction::Shl);
1625 case Intrinsic::aarch64_sve_lsr_u:
1627 Instruction::LShr);
1628 case Intrinsic::aarch64_sve_mul_u:
1630 Instruction::Mul);
1631 case Intrinsic::aarch64_sve_orr_u:
1633 Instruction::Or);
1634 case Intrinsic::aarch64_sve_sdiv_u:
1636 Instruction::SDiv);
1637 case Intrinsic::aarch64_sve_sub_u:
1639 Instruction::Sub);
1640 case Intrinsic::aarch64_sve_udiv_u:
1642 Instruction::UDiv);
1643
1644 case Intrinsic::aarch64_sve_addqv:
1645 case Intrinsic::aarch64_sve_and_z:
1646 case Intrinsic::aarch64_sve_bic_z:
1647 case Intrinsic::aarch64_sve_brka_z:
1648 case Intrinsic::aarch64_sve_brkb_z:
1649 case Intrinsic::aarch64_sve_brkn_z:
1650 case Intrinsic::aarch64_sve_brkpa_z:
1651 case Intrinsic::aarch64_sve_brkpb_z:
1652 case Intrinsic::aarch64_sve_cntp:
1653 case Intrinsic::aarch64_sve_compact:
1654 case Intrinsic::aarch64_sve_eor_z:
1655 case Intrinsic::aarch64_sve_eorv:
1656 case Intrinsic::aarch64_sve_eorqv:
1657 case Intrinsic::aarch64_sve_nand_z:
1658 case Intrinsic::aarch64_sve_nor_z:
1659 case Intrinsic::aarch64_sve_orn_z:
1660 case Intrinsic::aarch64_sve_orr_z:
1661 case Intrinsic::aarch64_sve_orv:
1662 case Intrinsic::aarch64_sve_orqv:
1663 case Intrinsic::aarch64_sve_pnext:
1664 case Intrinsic::aarch64_sve_rdffr_z:
1665 case Intrinsic::aarch64_sve_saddv:
1666 case Intrinsic::aarch64_sve_uaddv:
1667 case Intrinsic::aarch64_sve_umaxv:
1668 case Intrinsic::aarch64_sve_umaxqv:
1669 case Intrinsic::aarch64_sve_cmpeq:
1670 case Intrinsic::aarch64_sve_cmpeq_wide:
1671 case Intrinsic::aarch64_sve_cmpge:
1672 case Intrinsic::aarch64_sve_cmpge_wide:
1673 case Intrinsic::aarch64_sve_cmpgt:
1674 case Intrinsic::aarch64_sve_cmpgt_wide:
1675 case Intrinsic::aarch64_sve_cmphi:
1676 case Intrinsic::aarch64_sve_cmphi_wide:
1677 case Intrinsic::aarch64_sve_cmphs:
1678 case Intrinsic::aarch64_sve_cmphs_wide:
1679 case Intrinsic::aarch64_sve_cmple_wide:
1680 case Intrinsic::aarch64_sve_cmplo_wide:
1681 case Intrinsic::aarch64_sve_cmpls_wide:
1682 case Intrinsic::aarch64_sve_cmplt_wide:
1683 case Intrinsic::aarch64_sve_cmpne:
1684 case Intrinsic::aarch64_sve_cmpne_wide:
1685 case Intrinsic::aarch64_sve_facge:
1686 case Intrinsic::aarch64_sve_facgt:
1687 case Intrinsic::aarch64_sve_fcmpeq:
1688 case Intrinsic::aarch64_sve_fcmpge:
1689 case Intrinsic::aarch64_sve_fcmpgt:
1690 case Intrinsic::aarch64_sve_fcmpne:
1691 case Intrinsic::aarch64_sve_fcmpuo:
1692 case Intrinsic::aarch64_sve_ld1:
1693 case Intrinsic::aarch64_sve_ld1_gather:
1694 case Intrinsic::aarch64_sve_ld1_gather_index:
1695 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1696 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1697 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1698 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1699 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1700 case Intrinsic::aarch64_sve_ld1q_gather_index:
1701 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1702 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1703 case Intrinsic::aarch64_sve_ld1ro:
1704 case Intrinsic::aarch64_sve_ld1rq:
1705 case Intrinsic::aarch64_sve_ld1udq:
1706 case Intrinsic::aarch64_sve_ld1uwq:
1707 case Intrinsic::aarch64_sve_ld2_sret:
1708 case Intrinsic::aarch64_sve_ld2q_sret:
1709 case Intrinsic::aarch64_sve_ld3_sret:
1710 case Intrinsic::aarch64_sve_ld3q_sret:
1711 case Intrinsic::aarch64_sve_ld4_sret:
1712 case Intrinsic::aarch64_sve_ld4q_sret:
1713 case Intrinsic::aarch64_sve_ldff1:
1714 case Intrinsic::aarch64_sve_ldff1_gather:
1715 case Intrinsic::aarch64_sve_ldff1_gather_index:
1716 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1717 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1718 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1719 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1720 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1721 case Intrinsic::aarch64_sve_ldnf1:
1722 case Intrinsic::aarch64_sve_ldnt1:
1723 case Intrinsic::aarch64_sve_ldnt1_gather:
1724 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1725 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1726 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1728
1729 case Intrinsic::aarch64_sve_prf:
1730 case Intrinsic::aarch64_sve_prfb_gather_index:
1731 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1732 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1733 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1734 case Intrinsic::aarch64_sve_prfd_gather_index:
1735 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1736 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1737 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1738 case Intrinsic::aarch64_sve_prfh_gather_index:
1739 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1740 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1741 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1742 case Intrinsic::aarch64_sve_prfw_gather_index:
1743 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1744 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1745 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1747
1748 case Intrinsic::aarch64_sve_st1_scatter:
1749 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1750 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1751 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1752 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1753 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1754 case Intrinsic::aarch64_sve_st1dq:
1755 case Intrinsic::aarch64_sve_st1q_scatter_index:
1756 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1757 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1758 case Intrinsic::aarch64_sve_st1wq:
1759 case Intrinsic::aarch64_sve_stnt1:
1760 case Intrinsic::aarch64_sve_stnt1_scatter:
1761 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1762 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1763 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1765 case Intrinsic::aarch64_sve_st2:
1766 case Intrinsic::aarch64_sve_st2q:
1768 case Intrinsic::aarch64_sve_st3:
1769 case Intrinsic::aarch64_sve_st3q:
1771 case Intrinsic::aarch64_sve_st4:
1772 case Intrinsic::aarch64_sve_st4q:
1774 }
1775
1776 return SVEIntrinsicInfo();
1777}
1778
1779static bool isAllActivePredicate(Value *Pred) {
1780 Value *UncastedPred;
1781
1782 // Look through predicate casts that only remove lanes.
1783 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1784 m_Value(UncastedPred)))) {
1785 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1786 Pred = UncastedPred;
1787
1788 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1789 m_Value(UncastedPred))))
1790 // If the predicate has the same or fewer lanes than the uncasted predicate
1791 // then we know the casting has no effect.
1792 if (OrigPredTy->getMinNumElements() <=
1793 cast<ScalableVectorType>(UncastedPred->getType())
1794 ->getMinNumElements())
1795 Pred = UncastedPred;
1796 }
1797
1798 auto *C = dyn_cast<Constant>(Pred);
1799 return C && C->isAllOnesValue();
1800}
1801
1802// Simplify `V` by only considering the operations that affect active lanes.
1803// This function should only return existing Values or newly created Constants.
1804static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1805 auto *Dup = dyn_cast<IntrinsicInst>(V);
1806 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1807 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1808 return ConstantVector::getSplat(
1809 cast<VectorType>(V->getType())->getElementCount(),
1810 cast<Constant>(Dup->getOperand(2)));
1811
1812 return V;
1813}
1814
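// Simplify a predicated SVE binary-op intrinsic via its matching IR opcode,
// considering only the active lanes (e.g. sve.mul(pg, X, splat(1)) ==> X),
// while keeping the intrinsic's defined result for the inactive lanes.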
1815static std::optional<Instruction *>
1816 simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1817 const SVEIntrinsicInfo &IInfo) {
1818 const unsigned Opc = IInfo.getMatchingIROpode();
1819 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1820
1821 Value *Pg = II.getOperand(0);
1822 Value *Op1 = II.getOperand(1);
1823 Value *Op2 = II.getOperand(2);
1824 const DataLayout &DL = II.getDataLayout();
1825
1826 // Canonicalise constants to the RHS.
1828 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1829 IC.replaceOperand(II, 1, Op2);
1830 IC.replaceOperand(II, 2, Op1);
1831 return &II;
1832 }
1833
1834 // Only active lanes matter when simplifying the operation.
1835 Op1 = stripInactiveLanes(Op1, Pg);
1836 Op2 = stripInactiveLanes(Op2, Pg);
1837
1838 Value *SimpleII;
1839 if (auto FII = dyn_cast<FPMathOperator>(&II))
1840 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1841 else
1842 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1843
1844 // An SVE intrinsic's result is always defined. However, this is not the case
1845 // for its equivalent IR instruction (e.g. when shifting by an amount more
1846 // than the data's bitwidth). Simplifications to an undefined result must be
1847 // ignored to preserve the intrinsic's expected behaviour.
1848 if (!SimpleII || isa<UndefValue>(SimpleII))
1849 return std::nullopt;
1850
1851 if (IInfo.inactiveLanesAreNotDefined())
1852 return IC.replaceInstUsesWith(II, SimpleII);
1853
1854 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1855
1856 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1857 if (SimpleII == Inactive)
1858 return IC.replaceInstUsesWith(II, SimpleII);
1859
1860 // Inactive lanes must be preserved.
1861 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1862 return IC.replaceInstUsesWith(II, SimpleII);
1863}
1864
1865// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1866// to operations with less strict inactive lane requirements.
1867static std::optional<Instruction *>
1868 simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1869 const SVEIntrinsicInfo &IInfo) {
1870 if (!IInfo.hasGoverningPredicate())
1871 return std::nullopt;
1872
1873 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1874
1875 // If there are no active lanes.
1876 if (match(OpPredicate, m_ZeroInt())) {
1877 if (IInfo.inactiveLanesTakenFromOperand())
1878 return IC.replaceInstUsesWith(
1879 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1880
1881 if (IInfo.inactiveLanesAreUnused()) {
1882 if (IInfo.resultIsZeroInitialized())
1883 return IC.replaceInstUsesWith(II,
1884 Constant::getNullValue(II.getType()));
1885 return IC.eraseInstFromFunction(II);
1886 }
1887 }
1888
1889 // If there are no inactive lanes.
1890 if (isAllActivePredicate(OpPredicate)) {
1891 if (IInfo.hasOperandWithNoActiveLanes()) {
1892 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1893 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1894 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1895 }
1896
1897 if (IInfo.hasMatchingUndefIntrinsic()) {
1898 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1899 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1900 II.setCalledFunction(NewDecl);
1901 return &II;
1902 }
1903 }
1904
1905 // Operation specific simplifications.
1906 if (IInfo.hasMatchingIROpode() &&
1907 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1908 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1909
1910 return std::nullopt;
1911}
1912
1913 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1914// => (binop (pred) (from_svbool _) (from_svbool _))
1915//
1916// The above transformation eliminates a `to_svbool` in the predicate
1917// operand of bitwise operation `binop` by narrowing the vector width of
1918// the operation. For example, it would convert a `<vscale x 16 x i1>
1919// and` into a `<vscale x 4 x i1> and`. This is profitable because
1920// to_svbool must zero the new lanes during widening, whereas
1921// from_svbool is free.
1922static std::optional<Instruction *>
1923 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1924 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1925 if (!BinOp)
1926 return std::nullopt;
1927
1928 auto IntrinsicID = BinOp->getIntrinsicID();
1929 switch (IntrinsicID) {
1930 case Intrinsic::aarch64_sve_and_z:
1931 case Intrinsic::aarch64_sve_bic_z:
1932 case Intrinsic::aarch64_sve_eor_z:
1933 case Intrinsic::aarch64_sve_nand_z:
1934 case Intrinsic::aarch64_sve_nor_z:
1935 case Intrinsic::aarch64_sve_orn_z:
1936 case Intrinsic::aarch64_sve_orr_z:
1937 break;
1938 default:
1939 return std::nullopt;
1940 }
1941
1942 auto BinOpPred = BinOp->getOperand(0);
1943 auto BinOpOp1 = BinOp->getOperand(1);
1944 auto BinOpOp2 = BinOp->getOperand(2);
1945
1946 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1947 if (!PredIntr ||
1948 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1949 return std::nullopt;
1950
1951 auto PredOp = PredIntr->getOperand(0);
1952 auto PredOpTy = cast<VectorType>(PredOp->getType());
1953 if (PredOpTy != II.getType())
1954 return std::nullopt;
1955
1956 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1957 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1958 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1959 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1960 if (BinOpOp1 == BinOpOp2)
1961 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1962 else
1963 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1964 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1965
1966 auto NarrowedBinOp =
1967 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1968 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1969}
1970
1971static std::optional<Instruction *>
1972 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1973 // If the reinterpret instruction operand is a PHI Node
1974 if (isa<PHINode>(II.getArgOperand(0)))
1975 return processPhiNode(IC, II);
1976
1977 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1978 return BinOpCombine;
1979
1980 // Ignore converts to/from svcount_t.
1981 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1982 isa<TargetExtType>(II.getType()))
1983 return std::nullopt;
1984
1985 SmallVector<Instruction *, 32> CandidatesForRemoval;
1986 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1987
1988 const auto *IVTy = cast<VectorType>(II.getType());
1989
1990 // Walk the chain of conversions.
1991 while (Cursor) {
1992 // If the type of the cursor has fewer lanes than the final result, zeroing
1993 // must take place, which breaks the equivalence chain.
1994 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1995 if (CursorVTy->getElementCount().getKnownMinValue() <
1996 IVTy->getElementCount().getKnownMinValue())
1997 break;
1998
1999 // If the cursor has the same type as I, it is a viable replacement.
2000 if (Cursor->getType() == IVTy)
2001 EarliestReplacement = Cursor;
2002
2003 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
2004
2005 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2006 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2007 Intrinsic::aarch64_sve_convert_to_svbool ||
2008 IntrinsicCursor->getIntrinsicID() ==
2009 Intrinsic::aarch64_sve_convert_from_svbool))
2010 break;
2011
2012 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
2013 Cursor = IntrinsicCursor->getOperand(0);
2014 }
2015
2016 // If no viable replacement in the conversion chain was found, there is
2017 // nothing to do.
2018 if (!EarliestReplacement)
2019 return std::nullopt;
2020
2021 return IC.replaceInstUsesWith(II, EarliestReplacement);
2022}
2023
2024static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2025 IntrinsicInst &II) {
2026 // svsel(ptrue, x, y) => x
2027 auto *OpPredicate = II.getOperand(0);
2028 if (isAllActivePredicate(OpPredicate))
2029 return IC.replaceInstUsesWith(II, II.getOperand(1));
2030
2031 auto Select =
2032 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2033 return IC.replaceInstUsesWith(II, Select);
2034}
2035
2036static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2037 IntrinsicInst &II) {
2038 Value *Pg = II.getOperand(1);
2039
2040 // sve.dup(V, all_active, X) ==> splat(X)
2041 if (isAllActivePredicate(Pg)) {
2042 auto *RetTy = cast<ScalableVectorType>(II.getType());
2043 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2044 II.getArgOperand(2));
2045 return IC.replaceInstUsesWith(II, Splat);
2046 }
2047
2048 if (!match(Pg, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
2049 m_SpecificInt(AArch64SVEPredPattern::vl1))))
2050 return std::nullopt;
2051
2052 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2053 Value *Insert = IC.Builder.CreateInsertElement(
2054 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
2055 return IC.replaceInstUsesWith(II, Insert);
2056}
2057
2058static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2059 IntrinsicInst &II) {
2060 // Replace DupX with a regular IR splat.
2061 auto *RetTy = cast<ScalableVectorType>(II.getType());
2062 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2063 II.getArgOperand(0));
2064 Splat->takeName(&II);
2065 return IC.replaceInstUsesWith(II, Splat);
2066}
2067
2068static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2069 IntrinsicInst &II) {
2070 LLVMContext &Ctx = II.getContext();
2071
2072 if (!isAllActivePredicate(II.getArgOperand(0)))
2073 return std::nullopt;
2074
2075 // Check that we have a compare of zero..
2076 auto *SplatValue =
2077 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
2078 if (!SplatValue || !SplatValue->isZero())
2079 return std::nullopt;
2080
2081 // ..against a dupq
2082 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2083 if (!DupQLane ||
2084 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2085 return std::nullopt;
2086
2087 // Where the dupq is a lane 0 replicate of a vector insert
2088 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2089 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2090 return std::nullopt;
2091
2092 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2093 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2094 return std::nullopt;
2095
2096 // Where the vector insert is a fixed constant vector insert into undef at
2097 // index zero
2098 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2099 return std::nullopt;
2100
2101 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2102 return std::nullopt;
2103
2104 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2105 if (!ConstVec)
2106 return std::nullopt;
2107
2108 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2109 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2110 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2111 return std::nullopt;
2112
2113 unsigned NumElts = VecTy->getNumElements();
2114 unsigned PredicateBits = 0;
2115
2116 // Expand intrinsic operands to a 16-bit byte level predicate
2117 for (unsigned I = 0; I < NumElts; ++I) {
2118 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2119 if (!Arg)
2120 return std::nullopt;
2121 if (!Arg->isZero())
2122 PredicateBits |= 1 << (I * (16 / NumElts));
2123 }
2124
2125 // If all bits are zero bail early with an empty predicate
2126 if (PredicateBits == 0) {
2127 auto *PFalse = Constant::getNullValue(II.getType());
2128 PFalse->takeName(&II);
2129 return IC.replaceInstUsesWith(II, PFalse);
2130 }
2131
2132 // Calculate largest predicate type used (where byte predicate is largest)
2133 unsigned Mask = 8;
2134 for (unsigned I = 0; I < 16; ++I)
2135 if ((PredicateBits & (1 << I)) != 0)
2136 Mask |= (I % 8);
2137
2138 unsigned PredSize = Mask & -Mask;
2139 auto *PredType = ScalableVectorType::get(
2140 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2141
2142 // Ensure all relevant bits are set
2143 for (unsigned I = 0; I < 16; I += PredSize)
2144 if ((PredicateBits & (1 << I)) == 0)
2145 return std::nullopt;
2146
2147 auto *PTruePat =
2148 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2149 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2150 {PredType}, {PTruePat});
2151 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2152 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2153 auto *ConvertFromSVBool =
2154 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2155 {II.getType()}, {ConvertToSVBool});
2156
2157 ConvertFromSVBool->takeName(&II);
2158 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2159}
2160
2161static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2162 IntrinsicInst &II) {
2163 Value *Pg = II.getArgOperand(0);
2164 Value *Vec = II.getArgOperand(1);
2165 auto IntrinsicID = II.getIntrinsicID();
2166 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2167
2168 // lastX(splat(X)) --> X
2169 if (auto *SplatVal = getSplatValue(Vec))
2170 return IC.replaceInstUsesWith(II, SplatVal);
2171
2172 // If x and/or y is a splat value then:
2173 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2174 Value *LHS, *RHS;
2175 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2176 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2177 auto *OldBinOp = cast<BinaryOperator>(Vec);
2178 auto OpC = OldBinOp->getOpcode();
2179 auto *NewLHS =
2180 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2181 auto *NewRHS =
2182 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2183 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2184 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2185 return IC.replaceInstUsesWith(II, NewBinOp);
2186 }
2187 }
2188
2189 auto *C = dyn_cast<Constant>(Pg);
2190 if (IsAfter && C && C->isNullValue()) {
2191 // The intrinsic is extracting lane 0 so use an extract instead.
2192 auto *IdxTy = Type::getInt64Ty(II.getContext());
2193 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2194 Extract->insertBefore(II.getIterator());
2195 Extract->takeName(&II);
2196 return IC.replaceInstUsesWith(II, Extract);
2197 }
2198
2199 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2200 if (!IntrPG)
2201 return std::nullopt;
2202
2203 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2204 return std::nullopt;
2205
2206 const auto PTruePattern =
2207 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2208
2209 // Can the intrinsic's predicate be converted to a known constant index?
2210 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2211 if (!MinNumElts)
2212 return std::nullopt;
2213
2214 unsigned Idx = MinNumElts - 1;
2215 // Increment the index if extracting the element after the last active
2216 // predicate element.
2217 if (IsAfter)
2218 ++Idx;
2219
2220 // Ignore extracts whose index is larger than the known minimum vector
2221 // length. NOTE: This is an artificial constraint where we prefer to
2222 // maintain what the user asked for until an alternative is proven faster.
2223 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2224 if (Idx >= PgVTy->getMinNumElements())
2225 return std::nullopt;
2226
2227 // The intrinsic is extracting a fixed lane so use an extract instead.
2228 auto *IdxTy = Type::getInt64Ty(II.getContext());
2229 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2230 Extract->insertBefore(II.getIterator());
2231 Extract->takeName(&II);
2232 return IC.replaceInstUsesWith(II, Extract);
2233}
2234
2235static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2236 IntrinsicInst &II) {
2237 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2238 // integer variant across a variety of micro-architectures. Replace scalar
2239 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2240 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2241 // depending on the micro-architecture, but has been observed as generally
2242 // being faster, particularly when the CLAST[AB] op is a loop-carried
2243 // dependency.
2244 Value *Pg = II.getArgOperand(0);
2245 Value *Fallback = II.getArgOperand(1);
2246 Value *Vec = II.getArgOperand(2);
2247 Type *Ty = II.getType();
2248
2249 if (!Ty->isIntegerTy())
2250 return std::nullopt;
2251
2252 Type *FPTy;
2253 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2254 default:
2255 return std::nullopt;
2256 case 16:
2257 FPTy = IC.Builder.getHalfTy();
2258 break;
2259 case 32:
2260 FPTy = IC.Builder.getFloatTy();
2261 break;
2262 case 64:
2263 FPTy = IC.Builder.getDoubleTy();
2264 break;
2265 }
2266
2267 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2268 auto *FPVTy = VectorType::get(
2269 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2270 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2271 auto *FPII = IC.Builder.CreateIntrinsic(
2272 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2273 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2274 return IC.replaceInstUsesWith(II, FPIItoInt);
2275}
2276
2277static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2278 IntrinsicInst &II) {
2279 LLVMContext &Ctx = II.getContext();
2280 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2281 // can work with RDFFR_PP for ptest elimination.
2282 auto *AllPat =
2283 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2284 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2285 {II.getType()}, {AllPat});
2286 auto *RDFFR =
2287 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2288 RDFFR->takeName(&II);
2289 return IC.replaceInstUsesWith(II, RDFFR);
2290}
2291
2292static std::optional<Instruction *>
2293 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2294 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2295
2296 if (Pattern == AArch64SVEPredPattern::all) {
2297 Value *Cnt = IC.Builder.CreateElementCount(
2298 II.getType(), ElementCount::getScalable(NumElts));
2299 Cnt->takeName(&II);
2300 return IC.replaceInstUsesWith(II, Cnt);
2301 }
2302
2303 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2304
2305 return MinNumElts && NumElts >= MinNumElts
2306 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2307 II, ConstantInt::get(II.getType(), MinNumElts)))
2308 : std::nullopt;
2309}
2310
2311static std::optional<Instruction *>
2312 instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2313 const AArch64Subtarget *ST) {
2314 if (!ST->isStreaming())
2315 return std::nullopt;
2316
2317 // In streaming-mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2318 // with SVEPredPattern::all
2319 Value *Cnt =
2320 IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2));
2321 Cnt->takeName(&II);
2322 return IC.replaceInstUsesWith(II, Cnt);
2323}
2324
2325static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2326 IntrinsicInst &II) {
2327 Value *PgVal = II.getArgOperand(0);
2328 Value *OpVal = II.getArgOperand(1);
2329
2330 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2331 // Later optimizations prefer this form.
2332 if (PgVal == OpVal &&
2333 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2334 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2335 Value *Ops[] = {PgVal, OpVal};
2336 Type *Tys[] = {PgVal->getType()};
2337
2338 auto *PTest =
2339 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2340 PTest->takeName(&II);
2341
2342 return IC.replaceInstUsesWith(II, PTest);
2343 }
2344
2345 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2346 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2347
2348 if (!Pg || !Op)
2349 return std::nullopt;
2350
2351 Intrinsic::ID OpIID = Op->getIntrinsicID();
2352
2353 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2354 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2355 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2356 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2357 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2358
2359 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2360
2361 PTest->takeName(&II);
2362 return IC.replaceInstUsesWith(II, PTest);
2363 }
2364
2365 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2366 // Later optimizations may rewrite sequence to use the flag-setting variant
2367 // of instruction X to remove PTEST.
2368 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2369 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2370 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2371 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2372 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2373 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2374 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2375 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2376 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2377 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2378 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2379 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2380 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2381 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2382 Type *Tys[] = {Pg->getType()};
2383
2384 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2385 PTest->takeName(&II);
2386
2387 return IC.replaceInstUsesWith(II, PTest);
2388 }
2389
2390 return std::nullopt;
2391}
2392
2393template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2394static std::optional<Instruction *>
2396 bool MergeIntoAddendOp) {
2397 Value *P = II.getOperand(0);
2398 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2399 if (MergeIntoAddendOp) {
2400 AddendOp = II.getOperand(1);
2401 Mul = II.getOperand(2);
2402 } else {
2403 AddendOp = II.getOperand(2);
2404 Mul = II.getOperand(1);
2405 }
2406
2407 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2408 m_Value(MulOp1))))
2409 return std::nullopt;
2410
2411 if (!Mul->hasOneUse())
2412 return std::nullopt;
2413
2414 Instruction *FMFSource = nullptr;
2415 if (II.getType()->isFPOrFPVectorTy()) {
2416 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2417 // Stop the combine when the flags on the inputs differ in case dropping
2418 // flags would lead to us missing out on more beneficial optimizations.
2419 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2420 return std::nullopt;
2421 if (!FAddFlags.allowContract())
2422 return std::nullopt;
2423 FMFSource = &II;
2424 }
2425
2426 CallInst *Res;
2427 if (MergeIntoAddendOp)
2428 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2429 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2430 else
2431 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2432 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2433
2434 return IC.replaceInstUsesWith(II, Res);
2435}
2436
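// Lower sve.ld1 to a plain load when the predicate is all active, and to
// llvm.masked.load (with a zero passthru) otherwise.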
2437static std::optional<Instruction *>
2438 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2439 Value *Pred = II.getOperand(0);
2440 Value *PtrOp = II.getOperand(1);
2441 Type *VecTy = II.getType();
2442
2443 if (isAllActivePredicate(Pred)) {
2444 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2445 Load->copyMetadata(II);
2446 return IC.replaceInstUsesWith(II, Load);
2447 }
2448
2449 CallInst *MaskedLoad =
2450 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2451 Pred, ConstantAggregateZero::get(VecTy));
2452 MaskedLoad->copyMetadata(II);
2453 return IC.replaceInstUsesWith(II, MaskedLoad);
2454}
2455
2456static std::optional<Instruction *>
2457 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2458 Value *VecOp = II.getOperand(0);
2459 Value *Pred = II.getOperand(1);
2460 Value *PtrOp = II.getOperand(2);
2461
2462 if (isAllActivePredicate(Pred)) {
2463 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2464 Store->copyMetadata(II);
2465 return IC.eraseInstFromFunction(II);
2466 }
2467
2468 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2469 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2470 MaskedStore->copyMetadata(II);
2471 return IC.eraseInstFromFunction(II);
2472}
2473
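// Map the unpredicated (_u) SVE floating-point arithmetic intrinsics onto
// their IR binary opcode, returning BinaryOpsEnd when there is no equivalent.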
2474 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2475 switch (Intrinsic) {
2476 case Intrinsic::aarch64_sve_fmul_u:
2477 return Instruction::BinaryOps::FMul;
2478 case Intrinsic::aarch64_sve_fadd_u:
2479 return Instruction::BinaryOps::FAdd;
2480 case Intrinsic::aarch64_sve_fsub_u:
2481 return Instruction::BinaryOps::FSub;
2482 default:
2483 return Instruction::BinaryOpsEnd;
2484 }
2485}
2486
2487static std::optional<Instruction *>
2488 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2489 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2490 if (II.isStrictFP())
2491 return std::nullopt;
2492
2493 auto *OpPredicate = II.getOperand(0);
2494 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2495 if (BinOpCode == Instruction::BinaryOpsEnd ||
2496 !isAllActivePredicate(OpPredicate))
2497 return std::nullopt;
2498 auto BinOp = IC.Builder.CreateBinOpFMF(
2499 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2500 return IC.replaceInstUsesWith(II, BinOp);
2501}
2502
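// Fuse a single-use multiply feeding this add into the corresponding
// multiply-accumulate intrinsic (mla/mad).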
2503static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2504 IntrinsicInst &II) {
2505 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2506 Intrinsic::aarch64_sve_mla>(
2507 IC, II, true))
2508 return MLA;
2509 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2510 Intrinsic::aarch64_sve_mad>(
2511 IC, II, false))
2512 return MAD;
2513 return std::nullopt;
2514}
2515
2516static std::optional<Instruction *>
2517 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2518 if (auto FMLA =
2519 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2520 Intrinsic::aarch64_sve_fmla>(IC, II,
2521 true))
2522 return FMLA;
2523 if (auto FMAD =
2524 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2525 Intrinsic::aarch64_sve_fmad>(IC, II,
2526 false))
2527 return FMAD;
2528 if (auto FMLA =
2529 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2530 Intrinsic::aarch64_sve_fmla>(IC, II,
2531 true))
2532 return FMLA;
2533 return std::nullopt;
2534}
2535
2536static std::optional<Instruction *>
2537 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2538 if (auto FMLA =
2539 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2540 Intrinsic::aarch64_sve_fmla>(IC, II,
2541 true))
2542 return FMLA;
2543 if (auto FMAD =
2544 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2545 Intrinsic::aarch64_sve_fmad>(IC, II,
2546 false))
2547 return FMAD;
2548 if (auto FMLA_U =
2549 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2550 Intrinsic::aarch64_sve_fmla_u>(
2551 IC, II, true))
2552 return FMLA_U;
2553 return instCombineSVEVectorBinOp(IC, II);
2554}
2555
2556static std::optional<Instruction *>
2557 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2558 if (auto FMLS =
2559 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2560 Intrinsic::aarch64_sve_fmls>(IC, II,
2561 true))
2562 return FMLS;
2563 if (auto FMSB =
2564 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2565 Intrinsic::aarch64_sve_fnmsb>(
2566 IC, II, false))
2567 return FMSB;
2568 if (auto FMLS =
2569 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2570 Intrinsic::aarch64_sve_fmls>(IC, II,
2571 true))
2572 return FMLS;
2573 return std::nullopt;
2574}
2575
2576static std::optional<Instruction *>
2577 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2578 if (auto FMLS =
2579 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2580 Intrinsic::aarch64_sve_fmls>(IC, II,
2581 true))
2582 return FMLS;
2583 if (auto FMSB =
2584 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2585 Intrinsic::aarch64_sve_fnmsb>(
2586 IC, II, false))
2587 return FMSB;
2588 if (auto FMLS_U =
2589 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2590 Intrinsic::aarch64_sve_fmls_u>(
2591 IC, II, true))
2592 return FMLS_U;
2593 return instCombineSVEVectorBinOp(IC, II);
2594}
2595
2596static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2597 IntrinsicInst &II) {
2598 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2599 Intrinsic::aarch64_sve_mls>(
2600 IC, II, true))
2601 return MLS;
2602 return std::nullopt;
2603}
2604
2605static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2606 IntrinsicInst &II) {
2607 Value *UnpackArg = II.getArgOperand(0);
2608 auto *RetTy = cast<ScalableVectorType>(II.getType());
2609 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2610 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2611
2612 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2613 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2614 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2615 ScalarArg =
2616 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2617 Value *NewVal =
2618 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2619 NewVal->takeName(&II);
2620 return IC.replaceInstUsesWith(II, NewVal);
2621 }
2622
2623 return std::nullopt;
2624}
2625static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2626 IntrinsicInst &II) {
2627 auto *OpVal = II.getOperand(0);
2628 auto *OpIndices = II.getOperand(1);
2629 VectorType *VTy = cast<VectorType>(II.getType());
2630
2631 // Check whether OpIndices is a constant splat value < minimal element count
2632 // of result.
2633 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2634 if (!SplatValue ||
2635 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2636 return std::nullopt;
2637
2638 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2639 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2640 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2641 auto *VectorSplat =
2642 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2643
2644 VectorSplat->takeName(&II);
2645 return IC.replaceInstUsesWith(II, VectorSplat);
2646}
2647
2648static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2649 IntrinsicInst &II) {
2650 Value *A, *B;
2651 Type *RetTy = II.getType();
2652 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2653 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2654
2655 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2656 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2657 if ((match(II.getArgOperand(0),
2658 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2659 match(II.getArgOperand(1),
2660 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2661 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2662 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2663 auto *TyA = cast<ScalableVectorType>(A->getType());
2664 if (TyA == B->getType() &&
2665 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2666 auto *SubVec = IC.Builder.CreateInsertVector(
2667 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2668 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2669 TyA->getMinNumElements());
2670 ConcatVec->takeName(&II);
2671 return IC.replaceInstUsesWith(II, ConcatVec);
2672 }
2673 }
2674
2675 return std::nullopt;
2676}
2677
2678static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2679 IntrinsicInst &II) {
2680 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2681 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2682 Value *A, *B;
2683 if (match(II.getArgOperand(0),
2684 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2685 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2686 m_Specific(A), m_Specific(B))))
2687 return IC.replaceInstUsesWith(
2688 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2689
2690 return std::nullopt;
2691}
2692
2693static std::optional<Instruction *>
2694 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2695 Value *Mask = II.getOperand(0);
2696 Value *BasePtr = II.getOperand(1);
2697 Value *Index = II.getOperand(2);
2698 Type *Ty = II.getType();
2699 Value *PassThru = ConstantAggregateZero::get(Ty);
2700
2701 // Contiguous gather => masked load.
2702 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2703 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2704 Value *IndexBase;
2705 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2706 m_Value(IndexBase), m_SpecificInt(1)))) {
2707 Align Alignment =
2708 BasePtr->getPointerAlignment(II.getDataLayout());
2709
2710 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2711 BasePtr, IndexBase);
2712 CallInst *MaskedLoad =
2713 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2714 MaskedLoad->takeName(&II);
2715 return IC.replaceInstUsesWith(II, MaskedLoad);
2716 }
2717
2718 return std::nullopt;
2719}
2720
2721static std::optional<Instruction *>
2722 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2723 Value *Val = II.getOperand(0);
2724 Value *Mask = II.getOperand(1);
2725 Value *BasePtr = II.getOperand(2);
2726 Value *Index = II.getOperand(3);
2727 Type *Ty = Val->getType();
2728
2729 // Contiguous scatter => masked store.
2730 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2731 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2732 Value *IndexBase;
2733 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2734 m_Value(IndexBase), m_SpecificInt(1)))) {
2735 Align Alignment =
2736 BasePtr->getPointerAlignment(II.getDataLayout());
2737
2738 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2739 BasePtr, IndexBase);
2740 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2741
2742 return IC.eraseInstFromFunction(II);
2743 }
2744
2745 return std::nullopt;
2746}
2747
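// Replace a signed divide by a constant power-of-two splat with an arithmetic
// shift: sdiv(pg, X, splat(2^K)) => asrd(pg, X, K), negating the result for a
// negative power of two.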
2748static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2749 IntrinsicInst &II) {
2750 Type *Int32Ty = IC.Builder.getInt32Ty();
2751 Value *Pred = II.getOperand(0);
2752 Value *Vec = II.getOperand(1);
2753 Value *DivVec = II.getOperand(2);
2754
2755 Value *SplatValue = getSplatValue(DivVec);
2756 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2757 if (!SplatConstantInt)
2758 return std::nullopt;
2759
2760 APInt Divisor = SplatConstantInt->getValue();
2761 const int64_t DivisorValue = Divisor.getSExtValue();
2762 if (DivisorValue == -1)
2763 return std::nullopt;
2764 if (DivisorValue == 1)
2765 IC.replaceInstUsesWith(II, Vec);
2766
2767 if (Divisor.isPowerOf2()) {
2768 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2769 auto ASRD = IC.Builder.CreateIntrinsic(
2770 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2771 return IC.replaceInstUsesWith(II, ASRD);
2772 }
2773 if (Divisor.isNegatedPowerOf2()) {
2774 Divisor.negate();
2775 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2776 auto ASRD = IC.Builder.CreateIntrinsic(
2777 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2778 auto NEG = IC.Builder.CreateIntrinsic(
2779 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2780 return IC.replaceInstUsesWith(II, NEG);
2781 }
2782
2783 return std::nullopt;
2784}
2785
2786bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2787 size_t VecSize = Vec.size();
2788 if (VecSize == 1)
2789 return true;
2790 if (!isPowerOf2_64(VecSize))
2791 return false;
2792 size_t HalfVecSize = VecSize / 2;
2793
2794 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2795 RHS != Vec.end(); LHS++, RHS++) {
2796 if (*LHS != nullptr && *RHS != nullptr) {
2797 if (*LHS == *RHS)
2798 continue;
2799 else
2800 return false;
2801 }
2802 if (!AllowPoison)
2803 return false;
2804 if (*LHS == nullptr && *RHS != nullptr)
2805 *LHS = *RHS;
2806 }
2807
2808 Vec.resize(HalfVecSize);
2809 SimplifyValuePattern(Vec, AllowPoison);
2810 return true;
2811}
2812
2813// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2814// to dupqlane(f64(C)) where C is A concatenated with B
2815static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2816 IntrinsicInst &II) {
2817 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2818 if (!match(II.getOperand(0),
2819 m_Intrinsic<Intrinsic::vector_insert>(
2820 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2821 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2822 return std::nullopt;
2823 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2824
2825 // Insert the scalars into a container ordered by InsertElement index
2826 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2827 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2828 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2829 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2830 CurrentInsertElt = InsertElt->getOperand(0);
2831 }
2832
2833 bool AllowPoison =
2834 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2835 if (!SimplifyValuePattern(Elts, AllowPoison))
2836 return std::nullopt;
2837
2838 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2839 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2840 for (size_t I = 0; I < Elts.size(); I++) {
2841 if (Elts[I] == nullptr)
2842 continue;
2843 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2844 IC.Builder.getInt64(I));
2845 }
2846 if (InsertEltChain == nullptr)
2847 return std::nullopt;
2848
2849 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2850 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2851 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2852 // be narrowed back to the original type.
2853 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2854 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2855 IIScalableTy->getMinNumElements() /
2856 PatternWidth;
2857
2858 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2859 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2860 auto *WideShuffleMaskTy =
2861 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2862
2863 auto InsertSubvector = IC.Builder.CreateInsertVector(
2864 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2865 uint64_t(0));
2866 auto WideBitcast =
2867 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2868 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2869 auto WideShuffle = IC.Builder.CreateShuffleVector(
2870 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2871 auto NarrowBitcast =
2872 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2873
2874 return IC.replaceInstUsesWith(II, NarrowBitcast);
2875}
2876
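// fmaxnm(X, X) and fminnm(X, X) both simplify to X.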
2877static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2878 IntrinsicInst &II) {
2879 Value *A = II.getArgOperand(0);
2880 Value *B = II.getArgOperand(1);
2881 if (A == B)
2882 return IC.replaceInstUsesWith(II, A);
2883
2884 return std::nullopt;
2885}
2886
2887static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2888 IntrinsicInst &II) {
2889 Value *Pred = II.getOperand(0);
2890 Value *Vec = II.getOperand(1);
2891 Value *Shift = II.getOperand(2);
2892
2893 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2894 Value *AbsPred, *MergedValue;
2895 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2896 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2897 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2898 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2899
2900 return std::nullopt;
2901
2902 // Transform is valid if any of the following are true:
2903 // * The ABS merge value is an undef or non-negative
2904 // * The ABS predicate is all active
2905 // * The ABS predicate and the SRSHL predicates are the same
2906 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2907 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2908 return std::nullopt;
2909
2910 // Only valid when the shift amount is non-negative, otherwise the rounding
2911 // behaviour of SRSHL cannot be ignored.
2912 if (!match(Shift, m_NonNegative()))
2913 return std::nullopt;
2914
2915 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2916 {II.getType()}, {Pred, Vec, Shift});
2917
2918 return IC.replaceInstUsesWith(II, LSL);
2919}
2920
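// insr(splat(X), X) leaves the vector unchanged, so forward the vector
// operand.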
2921static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2922 IntrinsicInst &II) {
2923 Value *Vec = II.getOperand(0);
2924
2925 if (getSplatValue(Vec) == II.getOperand(1))
2926 return IC.replaceInstUsesWith(II, Vec);
2927
2928 return std::nullopt;
2929}
2930
2931static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2932 IntrinsicInst &II) {
2933 // If this barrier is post-dominated by an identical one, we can remove it.
2934 auto *NI = II.getNextNode();
2935 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2936 auto CanSkipOver = [](Instruction *I) {
2937 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2938 };
2939 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2940 auto *NIBB = NI->getParent();
2941 NI = NI->getNextNode();
2942 if (!NI) {
2943 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2944 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2945 else
2946 break;
2947 }
2948 }
2949 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2950 if (NextII && II.isIdenticalTo(NextII))
2951 return IC.eraseInstFromFunction(II);
2952
2953 return std::nullopt;
2954}
2955
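// Canonicalise whilelo to the target-independent get_active_lane_mask
// intrinsic, which computes the same predicate.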
2956static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2957 IntrinsicInst &II) {
2958 return IC.replaceInstUsesWith(
2959 II,
2960 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2961 {II.getType(), II.getOperand(0)->getType()},
2962 {II.getOperand(0), II.getOperand(1)}));
2963}
2964
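// ptrue with the 'all' pattern is just an all-true predicate constant.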
2965static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2966 IntrinsicInst &II) {
2967 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2968 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2969 return std::nullopt;
2970}
2971
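// uxtb/uxth/uxtw keep only the low NumBits of each element, so with an undef
// passthru or an all-active predicate they reduce to an AND with a low-bit
// mask, e.g. uxtb(passthru, pg, x) => and_u(pg, x, splat(0xff)).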
2972static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2973 IntrinsicInst &II,
2974 unsigned NumBits) {
2975 Value *Passthru = II.getOperand(0);
2976 Value *Pg = II.getOperand(1);
2977 Value *Op = II.getOperand(2);
2978
2979 // Convert UXT[BHW] to AND.
2980 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2981 auto *Ty = cast<VectorType>(II.getType());
2982 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2983 auto *Mask = ConstantInt::get(Ty, MaskValue);
2984 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2985 {Pg, Op, Mask});
2986 return IC.replaceInstUsesWith(II, And);
2987 }
2988
2989 return std::nullopt;
2990}
2991
2992static std::optional<Instruction *>
2993 instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2994 SMEAttrs FnSMEAttrs(*II.getFunction());
2995 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2996 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2997 return IC.replaceInstUsesWith(
2998 II, ConstantInt::getBool(II.getType(), IsStreaming));
2999 return std::nullopt;
3000}
3001
3002std::optional<Instruction *>
3003 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
3004 IntrinsicInst &II) const {
3005 const SVEIntrinsicInfo IInfo = constructSVEIntrinsicInfo(II);
3006 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3007 return I;
3008
3009 Intrinsic::ID IID = II.getIntrinsicID();
3010 switch (IID) {
3011 default:
3012 break;
3013 case Intrinsic::aarch64_dmb:
3014 return instCombineDMB(IC, II);
3015 case Intrinsic::aarch64_neon_fmaxnm:
3016 case Intrinsic::aarch64_neon_fminnm:
3017 return instCombineMaxMinNM(IC, II);
3018 case Intrinsic::aarch64_sve_convert_from_svbool:
3019 return instCombineConvertFromSVBool(IC, II);
3020 case Intrinsic::aarch64_sve_dup:
3021 return instCombineSVEDup(IC, II);
3022 case Intrinsic::aarch64_sve_dup_x:
3023 return instCombineSVEDupX(IC, II);
3024 case Intrinsic::aarch64_sve_cmpne:
3025 case Intrinsic::aarch64_sve_cmpne_wide:
3026 return instCombineSVECmpNE(IC, II);
3027 case Intrinsic::aarch64_sve_rdffr:
3028 return instCombineRDFFR(IC, II);
3029 case Intrinsic::aarch64_sve_lasta:
3030 case Intrinsic::aarch64_sve_lastb:
3031 return instCombineSVELast(IC, II);
3032 case Intrinsic::aarch64_sve_clasta_n:
3033 case Intrinsic::aarch64_sve_clastb_n:
3034 return instCombineSVECondLast(IC, II);
3035 case Intrinsic::aarch64_sve_cntd:
3036 return instCombineSVECntElts(IC, II, 2);
3037 case Intrinsic::aarch64_sve_cntw:
3038 return instCombineSVECntElts(IC, II, 4);
3039 case Intrinsic::aarch64_sve_cnth:
3040 return instCombineSVECntElts(IC, II, 8);
3041 case Intrinsic::aarch64_sve_cntb:
3042 return instCombineSVECntElts(IC, II, 16);
3043 case Intrinsic::aarch64_sme_cntsd:
3044 return instCombineSMECntsd(IC, II, ST);
3045 case Intrinsic::aarch64_sve_ptest_any:
3046 case Intrinsic::aarch64_sve_ptest_first:
3047 case Intrinsic::aarch64_sve_ptest_last:
3048 return instCombineSVEPTest(IC, II);
3049 case Intrinsic::aarch64_sve_fadd:
3050 return instCombineSVEVectorFAdd(IC, II);
3051 case Intrinsic::aarch64_sve_fadd_u:
3052 return instCombineSVEVectorFAddU(IC, II);
3053 case Intrinsic::aarch64_sve_fmul_u:
3054 return instCombineSVEVectorBinOp(IC, II);
3055 case Intrinsic::aarch64_sve_fsub:
3056 return instCombineSVEVectorFSub(IC, II);
3057 case Intrinsic::aarch64_sve_fsub_u:
3058 return instCombineSVEVectorFSubU(IC, II);
3059 case Intrinsic::aarch64_sve_add:
3060 return instCombineSVEVectorAdd(IC, II);
3061 case Intrinsic::aarch64_sve_add_u:
3062 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3063 Intrinsic::aarch64_sve_mla_u>(
3064 IC, II, true);
3065 case Intrinsic::aarch64_sve_sub:
3066 return instCombineSVEVectorSub(IC, II);
3067 case Intrinsic::aarch64_sve_sub_u:
3068 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3069 Intrinsic::aarch64_sve_mls_u>(
3070 IC, II, true);
3071 case Intrinsic::aarch64_sve_tbl:
3072 return instCombineSVETBL(IC, II);
3073 case Intrinsic::aarch64_sve_uunpkhi:
3074 case Intrinsic::aarch64_sve_uunpklo:
3075 case Intrinsic::aarch64_sve_sunpkhi:
3076 case Intrinsic::aarch64_sve_sunpklo:
3077 return instCombineSVEUnpack(IC, II);
3078 case Intrinsic::aarch64_sve_uzp1:
3079 return instCombineSVEUzp1(IC, II);
3080 case Intrinsic::aarch64_sve_zip1:
3081 case Intrinsic::aarch64_sve_zip2:
3082 return instCombineSVEZip(IC, II);
3083 case Intrinsic::aarch64_sve_ld1_gather_index:
3084 return instCombineLD1GatherIndex(IC, II);
3085 case Intrinsic::aarch64_sve_st1_scatter_index:
3086 return instCombineST1ScatterIndex(IC, II);
3087 case Intrinsic::aarch64_sve_ld1:
3088 return instCombineSVELD1(IC, II, DL);
3089 case Intrinsic::aarch64_sve_st1:
3090 return instCombineSVEST1(IC, II, DL);
3091 case Intrinsic::aarch64_sve_sdiv:
3092 return instCombineSVESDIV(IC, II);
3093 case Intrinsic::aarch64_sve_sel:
3094 return instCombineSVESel(IC, II);
3095 case Intrinsic::aarch64_sve_srshl:
3096 return instCombineSVESrshl(IC, II);
3097 case Intrinsic::aarch64_sve_dupq_lane:
3098 return instCombineSVEDupqLane(IC, II);
3099 case Intrinsic::aarch64_sve_insr:
3100 return instCombineSVEInsr(IC, II);
3101 case Intrinsic::aarch64_sve_whilelo:
3102 return instCombineWhilelo(IC, II);
3103 case Intrinsic::aarch64_sve_ptrue:
3104 return instCombinePTrue(IC, II);
3105 case Intrinsic::aarch64_sve_uxtb:
3106 return instCombineSVEUxt(IC, II, 8);
3107 case Intrinsic::aarch64_sve_uxth:
3108 return instCombineSVEUxt(IC, II, 16);
3109 case Intrinsic::aarch64_sve_uxtw:
3110 return instCombineSVEUxt(IC, II, 32);
3111 case Intrinsic::aarch64_sme_in_streaming_mode:
3112 return instCombineInStreamingMode(IC, II);
3113 }
3114
3115 return std::nullopt;
3116}
3117
3118 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3119 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3120 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3121 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3122 SimplifyAndSetOp) const {
3123 switch (II.getIntrinsicID()) {
3124 default:
3125 break;
3126 case Intrinsic::aarch64_neon_fcvtxn:
3127 case Intrinsic::aarch64_neon_rshrn:
3128 case Intrinsic::aarch64_neon_sqrshrn:
3129 case Intrinsic::aarch64_neon_sqrshrun:
3130 case Intrinsic::aarch64_neon_sqshrn:
3131 case Intrinsic::aarch64_neon_sqshrun:
3132 case Intrinsic::aarch64_neon_sqxtn:
3133 case Intrinsic::aarch64_neon_sqxtun:
3134 case Intrinsic::aarch64_neon_uqrshrn:
3135 case Intrinsic::aarch64_neon_uqshrn:
3136 case Intrinsic::aarch64_neon_uqxtn:
3137 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3138 break;
3139 }
3140
3141 return std::nullopt;
3142}
3143
3144 bool AArch64TTIImpl::enableScalableVectorization() const {
3145 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3146 EnableScalableAutovecInStreamingMode);
3147 }
3148
3149 TypeSize
3150 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3151 switch (K) {
3152 case TargetTransformInfo::RGK_Scalar:
3153 return TypeSize::getFixed(64);
3154 case TargetTransformInfo::RGK_FixedWidthVector:
3155 if (ST->useSVEForFixedLengthVectors() &&
3156 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3157 return TypeSize::getFixed(
3158 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3159 else if (ST->isNeonAvailable())
3160 return TypeSize::getFixed(128);
3161 else
3162 return TypeSize::getFixed(0);
3163 case TargetTransformInfo::RGK_ScalableVector:
3164 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3165 EnableScalableAutovecInStreamingMode))
3166 return TypeSize::getScalable(128);
3167 else
3168 return TypeSize::getScalable(0);
3169 }
3170 llvm_unreachable("Unsupported register kind");
3171}
3172
3173bool AArch64TTIImpl::isSingleExtWideningInstruction(
3174 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3175 Type *SrcOverrideTy) const {
3176 // A helper that returns a vector type from the given type. The number of
3177 // elements in DstTy determines the vector width.
3178 auto toVectorTy = [&](Type *ArgTy) {
3179 return VectorType::get(ArgTy->getScalarType(),
3180 cast<VectorType>(DstTy)->getElementCount());
3181 };
3182
3183 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3184 // i32, i64]. SVE doesn't generally have the same set of instructions to
3185 // perform an extend with the add/sub/mul. There are SMULLB style
3186 // instructions, but they operate on top/bottom, requiring some sort of lane
3187 // interleaving to be used with zext/sext.
3188 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3189 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3190 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3191 return false;
3192
3193 Type *SrcTy = SrcOverrideTy;
3194 switch (Opcode) {
3195 case Instruction::Add: // UADDW(2), SADDW(2).
3196 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3197 // The second operand needs to be an extend
3198 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3199 if (!SrcTy)
3200 SrcTy =
3201 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3202 break;
3203 }
3204
3205 if (Opcode == Instruction::Sub)
3206 return false;
3207
3208 // UADDW(2), SADDW(2) can be commuted.
3209 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3210 if (!SrcTy)
3211 SrcTy =
3212 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3213 break;
3214 }
3215 return false;
3216 }
3217 default:
3218 return false;
3219 }
3220
3221 // Legalize the destination type and ensure it can be used in a widening
3222 // operation.
3223 auto DstTyL = getTypeLegalizationCost(DstTy);
3224 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3225 return false;
3226
3227 // Legalize the source type and ensure it can be used in a widening
3228 // operation.
3229 assert(SrcTy && "Expected some SrcTy");
3230 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3231 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3232 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3233 return false;
3234
3235 // Get the total number of vector elements in the legalized types.
3236 InstructionCost NumDstEls =
3237 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3238 InstructionCost NumSrcEls =
3239 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3240
3241 // Return true if the legalized types have the same number of vector elements
3242 // and the destination element type size is twice that of the source type.
3243 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3244}
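// For example (illustrative), the single-extend widening pattern matched
// above corresponds to IR such as
//   %e = zext <8 x i8> %y to <8 x i16>
//   %r = add <8 x i16> %x, %e
// which is expected to lower to UADDW, so the extend feeding the add is
// effectively free.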
3245
3246Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3247 ArrayRef<const Value *> Args,
3248 Type *SrcOverrideTy) const {
3249 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3250 Opcode != Instruction::Mul)
3251 return nullptr;
3252
3253 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3254 // i32, i64]. SVE doesn't generally have the same set of instructions to
3255 // perform an extend with the add/sub/mul. There are SMULLB style
3256 // instructions, but they operate on top/bottom, requiring some sort of lane
3257 // interleaving to be used with zext/sext.
3258 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3259 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3260 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3261 return nullptr;
3262
3263 auto getScalarSizeWithOverride = [&](const Value *V) {
3264 if (SrcOverrideTy)
3265 return SrcOverrideTy->getScalarSizeInBits();
3266 return cast<Instruction>(V)
3267 ->getOperand(0)
3268 ->getType()
3269 ->getScalarSizeInBits();
3270 };
3271
3272 unsigned MaxEltSize = 0;
3273 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3274 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3275 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3276 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3277 MaxEltSize = std::max(EltSize0, EltSize1);
3278 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3279 isa<SExtInst, ZExtInst>(Args[1])) {
3280 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3281 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3282 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3283 // enough.
3284 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3285 return nullptr;
3286 MaxEltSize = DstEltSize / 2;
3287 } else if (Opcode == Instruction::Mul &&
3288 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3289 // If one of the operands is a Zext and the other has enough zero bits
3290 // to be treated as unsigned, we can still generate a umull, meaning the
3291 // zext is free.
3292 KnownBits Known =
3293 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3294 if (Args[0]->getType()->getScalarSizeInBits() -
3295 Known.Zero.countLeadingOnes() >
3296 DstTy->getScalarSizeInBits() / 2)
3297 return nullptr;
3298
3299 MaxEltSize =
3300 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3301 } else
3302 return nullptr;
3303
3304 if (MaxEltSize * 2 > DstEltSize)
3305 return nullptr;
3306
3307 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3308 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3309 return nullptr;
3310 return ExtTy;
3311}
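// For example (illustrative), the both-operands-extended case above matches
// IR such as
//   %a = sext <4 x i16> %x to <4 x i32>
//   %b = sext <4 x i16> %y to <4 x i32>
//   %m = mul <4 x i32> %a, %b
// which is expected to become SMULL; ExtTy here would be <4 x i32> and both
// extends are treated as free.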
3312
3313// s/urhadd instructions implement the following pattern, making the
3314// extends free:
3315// %x = add ((zext i8 -> i16), 1)
3316// %y = (zext i8 -> i16)
3317// trunc i16 (lshr (add %x, %y), 1) -> i8
3318//
3319bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3320 Type *Src) const {
3321 // The source should be a legal vector type.
3322 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3323 (Src->isScalableTy() && !ST->hasSVE2()))
3324 return false;
3325
3326 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3327 return false;
3328
3329 // Look for trunc/lshr/add before trying to match the pattern.
3330 const Instruction *Add = ExtUser;
3331 auto *AddUser =
3332 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3333 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3334 Add = AddUser;
3335
3336 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3337 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3338 return false;
3339
3340 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3341 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3342 Src->getScalarSizeInBits() !=
3343 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3344 return false;
3345
3346 // Try to match the whole pattern. Ext could be either the first or second
3347 // m_ZExtOrSExt matched.
3348 Instruction *Ex1, *Ex2;
3349 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3350 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3351 return false;
3352
3353 // Ensure both extends are of the same type
3354 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3355 Ex1->getOpcode() == Ex2->getOpcode())
3356 return true;
3357
3358 return false;
3359}
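// Illustrative note: when the pattern above matches, e.g. an <8 x i8>
// rounding average computed through i16, the whole zext/add/lshr/trunc chain
// is expected to lower to a single urhadd (srhadd for the signed form), which
// is why the extends feeding it are costed as free.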
3360
3361InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3362 Type *Src,
3363 TTI::CastContextHint CCH,
3364 TTI::TargetCostKind CostKind,
3365 const Instruction *I) const {
3366 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3367 assert(ISD && "Invalid opcode");
3368 // If the cast is observable, and it is used by a widening instruction (e.g.,
3369 // uaddl, saddw, etc.), it may be free.
3370 if (I && I->hasOneUser()) {
3371 auto *SingleUser = cast<Instruction>(*I->user_begin());
3372 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3373 if (Type *ExtTy = isBinExtWideningInstruction(
3374 SingleUser->getOpcode(), Dst, Operands,
3375 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3376 // The cost from Src->Src*2 needs to be added if required, the cost from
3377 // Src*2->ExtTy is free.
3378 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3379 Type *DoubleSrcTy =
3380 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3381 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3382 TTI::CastContextHint::None, CostKind);
3383 }
3384
3385 return 0;
3386 }
3387
3388 if (isSingleExtWideningInstruction(
3389 SingleUser->getOpcode(), Dst, Operands,
3390 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3391 // For adds only count the second operand as free if both operands are
3392 // extends but not the same operation (i.e., both operands are not free in
3393 // add(sext, zext)).
3394 if (SingleUser->getOpcode() == Instruction::Add) {
3395 if (I == SingleUser->getOperand(1) ||
3396 (isa<CastInst>(SingleUser->getOperand(1)) &&
3397 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3398 return 0;
3399 } else {
3400 // Others are free so long as isSingleExtWideningInstruction
3401 // returned true.
3402 return 0;
3403 }
3404 }
3405
3406 // The cast will be free for the s/urhadd instructions
3407 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3408 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3409 return 0;
3410 }
3411
3412 EVT SrcTy = TLI->getValueType(DL, Src);
3413 EVT DstTy = TLI->getValueType(DL, Dst);
3414
3415 if (!SrcTy.isSimple() || !DstTy.isSimple())
3416 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3417
3418 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3419 // we use fcvtx under SVE2. Give them invalid costs.
3420 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3421 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3422 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3423 return InstructionCost::getInvalid();
3424
3425 static const TypeConversionCostTblEntry BF16Tbl[] = {
3426 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3427 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3428 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3429 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3430 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3431 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3432 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3433 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3434 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3435 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3436 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3437 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3438 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3439 };
3440
3441 if (ST->hasBF16())
3442 if (const auto *Entry = ConvertCostTableLookup(
3443 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3444 return Entry->Cost;
3445
3446 // We have to estimate the cost of a fixed-length operation performed on
3447 // SVE registers using the number of SVE registers required to represent
3448 // the fixed-width type.
3449 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3450 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3451 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3452 ST->useSVEForFixedLengthVectors(WiderTy)) {
3453 std::pair<InstructionCost, MVT> LT =
3454 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3455 unsigned NumElements =
3456 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3457 return LT.first *
3458 getCastInstrCost(
3459 Opcode,
3460 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3461 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3462 CostKind, I);
3463 }
3464
3465 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3466 // The cost of unpacking twice is artificially increased for now in order
3467 // to avoid regressions against NEON, which will use tbl instructions directly
3468 // instead of multiple layers of [s|u]unpk[lo|hi].
3469 // We use the unpacks in cases where the destination type is illegal and
3470 // requires splitting of the input, even if the input type itself is legal.
3471 const unsigned int SVE_EXT_COST = 1;
3472 const unsigned int SVE_FCVT_COST = 1;
3473 const unsigned int SVE_UNPACK_ONCE = 4;
3474 const unsigned int SVE_UNPACK_TWICE = 16;
3475
3476 static const TypeConversionCostTblEntry ConversionTbl[] = {
3477 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3478 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3479 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3480 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3481 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3482 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3483 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3484 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3485 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3486 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3487 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3488 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3489 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3490 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3491 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3492 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3493 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3494 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3495 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3496 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3497
3498 // Truncations on nxvmiN
3499 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3500 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3501 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3502 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3503 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3504 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3505 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3506 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3507 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3508 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3509 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3510 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3511 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3512 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3513 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3514 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3515 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3516 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3517 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3518 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3519 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3520 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3521 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3522 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3523 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3524 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3525 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3526 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3527 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3528 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3529 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3530 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3531 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3532
3533 // The number of shll instructions for the extension.
3534 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3535 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3536 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3537 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3538 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3539 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3540 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3541 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3542 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3543 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3544 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3545 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3546 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3547 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3548 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3549 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3550
3551 // FP Ext and trunc
3552 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3553 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3554 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3555 // FP16
3556 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3557 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3558 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3559 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3560 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3561 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3562 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3563 // BF16 (uses shift)
3564 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3565 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3566 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3567 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3568 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3569 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3570 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3571 // FP Ext and trunc
3572 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3573 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3574 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3575 // FP16
3576 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3577 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3578 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3579 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3580 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3581 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3582 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3583 // BF16 (more complex, with +bf16 is handled above)
3584 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3585 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3586 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3587 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3588 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3589 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3590 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3591 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3592
3593 // LowerVectorINT_TO_FP:
3594 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3595 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3596 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3597 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3598 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3599 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3600
3601 // SVE: to nxv2f16
3602 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3603 SVE_EXT_COST + SVE_FCVT_COST},
3604 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3605 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3606 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3607 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3608 SVE_EXT_COST + SVE_FCVT_COST},
3609 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3610 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3611 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3612
3613 // SVE: to nxv4f16
3614 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3615 SVE_EXT_COST + SVE_FCVT_COST},
3616 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3617 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3618 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3619 SVE_EXT_COST + SVE_FCVT_COST},
3620 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3622
3623 // SVE: to nxv8f16
3624 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3625 SVE_EXT_COST + SVE_FCVT_COST},
3626 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3627 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3628 SVE_EXT_COST + SVE_FCVT_COST},
3629 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3630
3631 // SVE: to nxv16f16
3632 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3633 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3634 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3635 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3636
3637 // Complex: to v2f32
3638 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3639 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3640 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3641 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3642
3643 // SVE: to nxv2f32
3644 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3645 SVE_EXT_COST + SVE_FCVT_COST},
3646 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3647 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3648 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3649 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3650 SVE_EXT_COST + SVE_FCVT_COST},
3651 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3652 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3653 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3654
3655 // Complex: to v4f32
3656 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3657 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3658 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3659 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3660
3661 // SVE: to nxv4f32
3662 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3663 SVE_EXT_COST + SVE_FCVT_COST},
3664 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3665 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3666 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3667 SVE_EXT_COST + SVE_FCVT_COST},
3668 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3669 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3670
3671 // Complex: to v8f32
3672 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3673 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3674 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3675 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3676
3677 // SVE: to nxv8f32
3678 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3679 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3680 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3681 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3682 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3683 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3684 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3685 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3686
3687 // SVE: to nxv16f32
3688 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3689 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3690 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3691 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3692
3693 // Complex: to v16f32
3694 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3695 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3696
3697 // Complex: to v2f64
3698 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3699 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3700 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3701 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3702 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3703 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3704
3705 // SVE: to nxv2f64
3706 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3707 SVE_EXT_COST + SVE_FCVT_COST},
3708 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3709 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3710 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3711 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3712 SVE_EXT_COST + SVE_FCVT_COST},
3713 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3714 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3715 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3716
3717 // Complex: to v4f64
3718 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3719 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3720
3721 // SVE: to nxv4f64
3722 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3723 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3724 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3725 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3726 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3727 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3728 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3729 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3730 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3731 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3732 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3733 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3734
3735 // SVE: to nxv8f64
3736 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3737 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3738 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3739 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3740 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3741 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3742 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3743 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3744
3745 // LowerVectorFP_TO_INT
3746 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3747 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3748 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3749 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3750 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3751 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3752
3753 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3754 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3755 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3756 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3757 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3758 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3759 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3760
3761 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3762 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3763 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3764 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3765 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3766
3767 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3768 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3769 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3770 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3771 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3772 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3773 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3774
3775 // Complex, from nxv2f32.
3776 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3777 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3778 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3779 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3780 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3781 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3782 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3783 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3784
3785 // Complex, from nxv2f64.
3786 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3787 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3788 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3789 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3790 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3791 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3792 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3793 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3794 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3795 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3796
3797 // Complex, from nxv4f32.
3798 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3799 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3800 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3801 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3802 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3803 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3804 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3805 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3806 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3807 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3808
3809 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3810 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3811 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3812 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3813 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3814
3815 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3816 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3817 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3818 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3819 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3820 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3821 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3822
3823 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3824 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3825 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3826 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3827 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3828
3829 // Complex, from nxv8f16.
3830 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3831 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3832 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3833 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3834 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3835 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3836 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3837 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3838 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3839 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3840
3841 // Complex, from nxv4f16.
3842 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3843 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3844 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3845 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3846 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3847 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3848 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3849 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3850
3851 // Complex, from nxv2f16.
3852 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3853 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3854 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3855 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3856 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3857 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3858 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3859 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3860
3861 // Truncate from nxvmf32 to nxvmf16.
3862 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3863 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3864 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3865
3866 // Truncate from nxvmf32 to nxvmbf16.
3867 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3868 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3869 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3870
3871 // Truncate from nxvmf64 to nxvmf16.
3872 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3873 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3874 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3875
3876 // Truncate from nxvmf64 to nxvmbf16.
3877 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3878 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3879 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3880
3881 // Truncate from nxvmf64 to nxvmf32.
3882 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3883 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3884 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3885
3886 // Extend from nxvmf16 to nxvmf32.
3887 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3888 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3889 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3890
3891 // Extend from nxvmbf16 to nxvmf32.
3892 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3893 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3894 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3895
3896 // Extend from nxvmf16 to nxvmf64.
3897 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3898 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3899 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3900
3901 // Extend from nxvmbf16 to nxvmf64.
3902 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3903 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3904 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3905
3906 // Extend from nxvmf32 to nxvmf64.
3907 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3908 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3909 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3910
3911 // Bitcasts from float to integer
3912 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3913 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3914 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3915
3916 // Bitcasts from integer to float
3917 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3918 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3919 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3920
3921 // Add cost for extending to illegal (too wide) scalable vectors.
3922 // zero/sign extend are implemented by multiple unpack operations,
3923 // where each operation has a cost of 1.
3924 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3925 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3926 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3927 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3928 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3929 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3930
3931 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3932 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3933 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3934 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3935 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3936 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3937 };
3938
3939 if (const auto *Entry = ConvertCostTableLookup(
3940 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3941 return Entry->Cost;
3942
3943 static const TypeConversionCostTblEntry FP16Tbl[] = {
3944 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3945 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3946 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3947 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3948 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3949 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3950 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3951 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3952 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3953 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3954 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3955 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3956 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3957 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3958 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3959 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3960 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3961 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3962 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3963 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3964 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3965 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3966 };
3967
3968 if (ST->hasFullFP16())
3969 if (const auto *Entry = ConvertCostTableLookup(
3970 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3971 return Entry->Cost;
3972
3973 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3974 // double-rounding issues.
3975 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3976 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3978 return cast<FixedVectorType>(Dst)->getNumElements() *
3979 getCastInstrCost(Opcode, Dst->getScalarType(),
3980 Src->getScalarType(), CCH, CostKind) +
3982 true, CostKind) +
3984 false, CostKind);
3985
3986 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3988 ST->isSVEorStreamingSVEAvailable() &&
3989 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3990 TargetLowering::TypePromoteInteger &&
3991 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3992 TargetLowering::TypeSplitVector) {
3993 // The standard behaviour in the backend for these cases is to split the
3994 // extend up into two parts:
3995 // 1. Perform an extending load or masked load up to the legal type.
3996 // 2. Extend the loaded data to the final type.
3997 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3998 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3999 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
4000 Opcode, LegalTy, Src, CCH, CostKind, I);
4001 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
4002 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
4003 return Part1 + Part2;
4004 }
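 // A sketch of the split above (illustrative): a masked zext from
 // <vscale x 4 x i16> to <vscale x 4 x i64> would be costed as Part1, the
 // extending masked load up to the legal <vscale x 4 x i32> type, plus Part2,
 // the extension from <vscale x 4 x i32> to the split <vscale x 4 x i64> type.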
4005
4006 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4007 // but we also want to include the TTI::CastContextHint::Masked case too.
4008 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4009 CCH == TTI::CastContextHint::Masked &&
4010 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4011 return 0;
4012
4013 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4014}
4015
4016InstructionCost
4017AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
4018 VectorType *VecTy, unsigned Index,
4019 TTI::TargetCostKind CostKind) const {
4020
4021 // Make sure we were given a valid extend opcode.
4022 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4023 "Invalid opcode");
4024
4025 // We are extending an element we extract from a vector, so the source type
4026 // of the extend is the element type of the vector.
4027 auto *Src = VecTy->getElementType();
4028
4029 // Sign- and zero-extends are for integer types only.
4030 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4031
4032 // Get the cost for the extract. We compute the cost (if any) for the extend
4033 // below.
4034 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4035 CostKind, Index, nullptr, nullptr);
4036
4037 // Legalize the types.
4038 auto VecLT = getTypeLegalizationCost(VecTy);
4039 auto DstVT = TLI->getValueType(DL, Dst);
4040 auto SrcVT = TLI->getValueType(DL, Src);
4041
4042 // If the resulting type is still a vector and the destination type is legal,
4043 // we may get the extension for free. If not, get the default cost for the
4044 // extend.
4045 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4046 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4047 CostKind);
4048
4049 // The destination type should be larger than the element type. If not, get
4050 // the default cost for the extend.
4051 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4052 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4053 CostKind);
4054
4055 switch (Opcode) {
4056 default:
4057 llvm_unreachable("Opcode should be either SExt or ZExt");
4058
4059 // For sign-extends, we only need a smov, which performs the extension
4060 // automatically.
4061 case Instruction::SExt:
4062 return Cost;
4063
4064 // For zero-extends, the extend is performed automatically by a umov unless
4065 // the destination type is i64 and the element type is i8 or i16.
4066 case Instruction::ZExt:
4067 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4068 return Cost;
4069 }
4070
4071 // If we are unable to perform the extend for free, get the default cost.
4072 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4073 CostKind);
4074}
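// For example (illustrative):
//   %e = extractelement <8 x i16> %v, i64 3
//   %z = zext i16 %e to i32
// is expected to lower to a single umov, which already zeroes the upper bits,
// so only the extract cost is returned and the zext itself is free.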
4075
4076InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
4077 TTI::TargetCostKind CostKind,
4078 const Instruction *I) const {
4079 if (CostKind != TTI::TCK_RecipThroughput)
4080 return Opcode == Instruction::PHI ? 0 : 1;
4081 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4082 // Branches are assumed to be predicted.
4083 return 0;
4084}
4085
4086InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4087 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4088 const Instruction *I, Value *Scalar,
4089 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4090 TTI::VectorInstrContext VIC) const {
4091 assert(Val->isVectorTy() && "This must be a vector type");
4092
4093 if (Index != -1U) {
4094 // Legalize the type.
4095 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4096
4097 // This type is legalized to a scalar type.
4098 if (!LT.second.isVector())
4099 return 0;
4100
4101 // The type may be split. For fixed-width vectors we can normalize the
4102 // index to the new type.
4103 if (LT.second.isFixedLengthVector()) {
4104 unsigned Width = LT.second.getVectorNumElements();
4105 Index = Index % Width;
4106 }
4107
4108 // The element at index zero is already inside the vector.
4109 // - For an insert-element or extract-element
4110 // instruction that extracts integers, an explicit FPR -> GPR move is
4111 // needed, so it has non-zero cost.
4112 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4113 return 0;
4114
4115 // This is recognising a LD1 single-element structure to one lane of one
4116 // register instruction. I.e., if this is an `insertelement` instruction,
4117 // and its second operand is a load, then we will generate a LD1, which
4118 // are expensive instructions on some uArchs.
4119 if (VIC == TTI::VectorInstrContext::Load) {
4120 if (ST->hasFastLD1Single())
4121 return 0;
4122 return CostKind == TTI::TCK_CodeSize
4123 ? 0
4124 : ST->getVectorInsertExtractBaseCost();
4125 }
4126
4127 // i1 inserts and extracts will include an extra cset or cmp of the vector
4128 // value. Increase the cost by 1 to account for this.
4129 if (Val->getScalarSizeInBits() == 1)
4130 return CostKind == TTI::TCK_CodeSize
4131 ? 2
4132 : ST->getVectorInsertExtractBaseCost() + 1;
4133
4134 // FIXME:
4135 // If the extract-element and insert-element instructions could be
4136 // simplified away (e.g., could be combined into users by looking at use-def
4137 // context), they have no cost. This is not done in the first place for
4138 // compile-time considerations.
4139 }
4140
4141 // In case of Neon, if there exists extractelement from lane != 0 such that
4142 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4143 // 2. extractelement result feeds into fmul.
4144 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4145 // equivalent to 0.
4146 // then the extractelement can be merged with fmul in the backend and it
4147 // incurs no cost.
4148 // e.g.
4149 // define double @foo(<2 x double> %a) {
4150 // %1 = extractelement <2 x double> %a, i32 0
4151 // %2 = extractelement <2 x double> %a, i32 1
4152 // %res = fmul double %1, %2
4153 // ret double %res
4154 // }
4155 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4156 auto ExtractCanFuseWithFmul = [&]() {
4157 // We bail out if the extract is from lane 0.
4158 if (Index == 0)
4159 return false;
4160
4161 // Check if the scalar element type of the vector operand of ExtractElement
4162 // instruction is one of the allowed types.
4163 auto IsAllowedScalarTy = [&](const Type *T) {
4164 return T->isFloatTy() || T->isDoubleTy() ||
4165 (T->isHalfTy() && ST->hasFullFP16());
4166 };
4167
4168 // Check if the extractelement user is scalar fmul.
4169 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4170 // Check if the user is scalar fmul.
4171 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4172 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4173 !BO->getType()->isVectorTy();
4174 };
4175
4176 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4177 // certain scalar type and a certain vector register width.
4178 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4179 auto RegWidth =
4180 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4181 .getFixedValue();
4182 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4183 };
4184
4185 // Check if the type constraints on input vector type and result scalar type
4186 // of extractelement instruction are satisfied.
4187 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4188 return false;
4189
4190 if (Scalar) {
4191 DenseMap<User *, unsigned> UserToExtractIdx;
4192 for (auto *U : Scalar->users()) {
4193 if (!IsUserFMulScalarTy(U))
4194 return false;
4195 // Recording entry for the user is important. Index value is not
4196 // important.
4197 UserToExtractIdx[U];
4198 }
4199 if (UserToExtractIdx.empty())
4200 return false;
4201 for (auto &[S, U, L] : ScalarUserAndIdx) {
4202 for (auto *U : S->users()) {
4203 if (UserToExtractIdx.contains(U)) {
4204 auto *FMul = cast<BinaryOperator>(U);
4205 auto *Op0 = FMul->getOperand(0);
4206 auto *Op1 = FMul->getOperand(1);
4207 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4208 UserToExtractIdx[U] = L;
4209 break;
4210 }
4211 }
4212 }
4213 }
4214 for (auto &[U, L] : UserToExtractIdx) {
4215 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4216 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4217 return false;
4218 }
4219 } else {
4220 const auto *EE = cast<ExtractElementInst>(I);
4221
4222 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4223 if (!IdxOp)
4224 return false;
4225
4226 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4227 if (!IsUserFMulScalarTy(U))
4228 return false;
4229
4230 // Check if the other operand of extractelement is also extractelement
4231 // from lane equivalent to 0.
4232 const auto *BO = cast<BinaryOperator>(U);
4233 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4234 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4235 if (OtherEE) {
4236 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4237 if (!IdxOp)
4238 return false;
4239 return IsExtractLaneEquivalentToZero(
4240 cast<ConstantInt>(OtherEE->getIndexOperand())
4241 ->getValue()
4242 .getZExtValue(),
4243 OtherEE->getType()->getScalarSizeInBits());
4244 }
4245 return true;
4246 });
4247 }
4248 return true;
4249 };
4250
4251 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4252 ExtractCanFuseWithFmul())
4253 return 0;
4254
4255 // All other insert/extracts cost this much.
4256 return CostKind == TTI::TCK_CodeSize ? 1
4257 : ST->getVectorInsertExtractBaseCost();
4258}
4259
4261 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4262 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4263 // Treat insert at lane 0 into a poison vector as having zero cost. This
4264 // ensures vector broadcasts via an insert + shuffle (which will be lowered to a
4265 // single dup) are treated as cheap.
4266 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4267 isa<PoisonValue>(Op0))
4268 return 0;
4269 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4270 nullptr, {}, VIC);
4271}
4272
4274 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4275 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4276 TTI::VectorInstrContext VIC) const {
4277 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4278 ScalarUserAndIdx, VIC);
4279}
4280
4283 TTI::TargetCostKind CostKind, unsigned Index,
4284 TTI::VectorInstrContext VIC) const {
4285 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4286 nullptr, {}, VIC);
4287}
4288
4292 unsigned Index) const {
4293 if (isa<FixedVectorType>(Val))
4295 Index);
4296
4297 // This typically requires both while and lastb instructions in order
4298 // to extract the last element. If this is in a loop the while
4299 // instruction can at least be hoisted out, although it will consume a
4300 // predicate register. The cost should be higher than the base
4301 // extract cost, which is 2 for most CPUs.
4302 return CostKind == TTI::TCK_CodeSize
4303 ? 2
4304 : ST->getVectorInsertExtractBaseCost() + 1;
4305}
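// Illustrative note: extracting the last element of, say, a <vscale x 4 x i32>
// value is expected to need a while-style predicate set-up plus a lastb, which
// is why the scalable case above is costed one above the base extract cost.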
4306
4307InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4308 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4309 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4310 TTI::VectorInstrContext VIC) const {
4311 if (isa<ScalableVectorType>(Ty))
4312 return InstructionCost::getInvalid();
4313 if (Ty->getElementType()->isFloatingPointTy())
4314 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4315 CostKind);
4316 unsigned VecInstCost =
4317 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4318 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4319}
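// Worked example (illustrative): scalarizing three demanded lanes of a
// <4 x i32> value for insertion only costs 3 * (1 + 0) * VecInstCost with the
// formula above, i.e. three GPR -> vector-lane moves.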
4320
4321std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4322 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4323 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4324 std::function<InstructionCost(Type *)> InstCost) const {
4325 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4326 return std::nullopt;
4327 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4328 return std::nullopt;
4329 // If we have +sve-b16b16 the operation can be promoted to SVE.
4330 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4331 return std::nullopt;
4332
4333 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4334 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4335 TTI::CastContextHint::None, CostKind);
4336 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4337 Cost *= 2;
4338 Cost += InstCost(PromotedTy);
4339 if (IncludeTrunc)
4340 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4341 TTI::CastContextHint::None, CostKind);
4342 return Cost;
4343}
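// A sketch of the promotion above (illustrative): an fadd on <4 x half>
// without +fullfp16 is costed as two fpext <4 x half> -> <4 x float> when both
// operands are non-constant, plus the float operation from InstCost, plus the
// fptrunc back to <4 x half> when IncludeTrunc is set.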
4344
4345InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4346 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4347 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4348 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4349
4350 // The code-generator is currently not able to handle scalable vectors
4351 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4352 // it. This change will be removed when code-generation for these types is
4353 // sufficiently reliable.
4354 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4355 if (VTy->getElementCount() == ElementCount::getScalable(1))
4356 return InstructionCost::getInvalid();
4357
4358 // TODO: Handle more cost kinds.
4359 if (CostKind != TTI::TCK_RecipThroughput)
4360 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4361 Op2Info, Args, CxtI);
4362
4363 // Legalize the type.
4364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4365 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4366
4367 // Increase the cost for half and bfloat types if not architecturally
4368 // supported.
4369 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4370 ISD == ISD::FDIV || ISD == ISD::FREM)
4371 if (auto PromotedCost = getFP16BF16PromoteCost(
4372 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4373 // There is not native support for fdiv/frem even with +sve-b16b16.
4374 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4375 [&](Type *PromotedTy) {
4376 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4377 Op1Info, Op2Info);
4378 }))
4379 return *PromotedCost;
4380
4381 // If the operation is a widening instruction (smull or umull) and both
4382 // operands are extends the cost can be cheaper by considering that the
4383 // operation will operate on the narrowest type size possible (double the
4384 // largest input size) and a further extend.
4385 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4386 if (ExtTy != Ty)
4387 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4388 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4389 TTI::CastContextHint::None, CostKind);
4390 return LT.first;
4391 }
4392
4393 switch (ISD) {
4394 default:
4395 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4396 Op2Info);
4397 case ISD::SREM:
4398 case ISD::SDIV:
4399 /*
4400 Notes for sdiv/srem specific costs:
4401 1. This only considers the cases where the divisor is constant, uniform and
4402 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4403 result in some form of (ldr + adrp), corresponding to constant vectors, or
4404 scalarization of the division operation.
4405 2. Constant divisors, whether wholly or partially negative, don't result in
4406 significantly different codegen as compared to positive constant divisors.
4407 So, we don't consider negative divisors separately.
4408 3. If the codegen is significantly different with SVE, it has been indicated
4409 using comments at appropriate places.
4410
4411 sdiv specific cases:
4412 -----------------------------------------------------------------------
4413 codegen | pow-of-2 | Type
4414 -----------------------------------------------------------------------
4415 add + cmp + csel + asr | Y | i64
4416 add + cmp + csel + asr | Y | i32
4417 -----------------------------------------------------------------------
4418
4419 srem specific cases:
4420 -----------------------------------------------------------------------
4421 codegen | pow-of-2 | Type
4422 -----------------------------------------------------------------------
4423 negs + and + and + csneg | Y | i64
4424 negs + and + and + csneg | Y | i32
4425 -----------------------------------------------------------------------
4426
4427 other sdiv/srem cases:
4428 -------------------------------------------------------------------------
4429 common codegen | + srem | + sdiv | pow-of-2 | Type
4430 -------------------------------------------------------------------------
4431 smulh + asr + add + add | - | - | N | i64
4432 smull + lsr + add + add | - | - | N | i32
4433 usra | and + sub | sshr | Y | <2 x i64>
4434 2 * (scalar code) | - | - | N | <2 x i64>
4435 usra | bic + sub | sshr + neg | Y | <4 x i32>
4436 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4437 + sshr + usra | | | |
4438 -------------------------------------------------------------------------
4439 */
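 // Worked example (illustrative): for a scalar i32 sdiv by the uniform
 // constant 8 (a power of two), the table above gives add + cmp + csel + asr,
 // which the code below models as 3 * AddCost + AsrCost.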
4440 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4441 InstructionCost AddCost =
4442 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4443 Op1Info.getNoProps(), Op2Info.getNoProps());
4444 InstructionCost AsrCost =
4445 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4446 Op1Info.getNoProps(), Op2Info.getNoProps());
4447 InstructionCost MulCost =
4448 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4449 Op1Info.getNoProps(), Op2Info.getNoProps());
4450 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4451 // have similar cost.
4452 auto VT = TLI->getValueType(DL, Ty);
4453 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4454 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4455 // Neg can be folded into the asr instruction.
4456 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4457 : (3 * AsrCost + AddCost);
4458 } else {
4459 return MulCost + AsrCost + 2 * AddCost;
4460 }
4461 } else if (VT.isVector()) {
4462 InstructionCost UsraCost = 2 * AsrCost;
4463 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4464 // Division with scalable types corresponds to native 'asrd'
4465 // instruction when SVE is available.
4466 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4467
4468 // One more for the negation in SDIV
4469 InstructionCost Cost =
4470 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4471 if (Ty->isScalableTy() && ST->hasSVE())
4472 Cost += 2 * AsrCost;
4473 else {
4474 Cost +=
4475 UsraCost +
4476 (ISD == ISD::SDIV
4477 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4478 : 2 * AddCost);
4479 }
4480 return Cost;
4481 } else if (LT.second == MVT::v2i64) {
4482 return VT.getVectorNumElements() *
4483 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4484 Op1Info.getNoProps(),
4485 Op2Info.getNoProps());
4486 } else {
4487 // When SVE is available, we get:
4488 // smulh + lsr + add/sub + asr + add/sub.
4489 if (Ty->isScalableTy() && ST->hasSVE())
4490 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4491 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4492 }
4493 }
4494 }
4495 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4496 LT.second.isFixedLengthVector()) {
4497 // FIXME: When the constant vector is non-uniform, this may result in
4498 // loading the vector from constant pool or in some cases, may also result
4499 // in scalarization. For now, we are approximating this with the
4500 // scalarization cost.
4501 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4502 CostKind, -1, nullptr, nullptr);
4503 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4504 CostKind, -1, nullptr, nullptr);
4505 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4506 return ExtractCost + InsertCost +
4507 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4508 CostKind, Op1Info.getNoProps(),
4509 Op2Info.getNoProps());
4510 }
4511 [[fallthrough]];
4512 case ISD::UDIV:
4513 case ISD::UREM: {
4514 auto VT = TLI->getValueType(DL, Ty);
4515 if (Op2Info.isConstant()) {
4516 // If the operand is a power of 2 we can use the shift or and cost.
4517 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4518 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4519 Op1Info.getNoProps(),
4520 Op2Info.getNoProps());
4521 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4522 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4523 Op1Info.getNoProps(),
4524 Op2Info.getNoProps());
4525
4526 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4527 // Division by a constant is expanded to MULHU + SUB + SRL + ADD + SRL.
4528 // The MULHU will be expanded to UMULL for the types not listed below,
4529 // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
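// For example, 'udiv i32 %x, 7' expands to roughly umulh + sub + lsr + add +
// lsr, and 'urem i32 %x, 7' additionally needs a multiply and a subtract to
// recover the remainder, which the UREM adjustment below models as
// MulCost + AddCost.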
4530 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4531 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4532 LT.second == MVT::nxv16i8;
4533 bool Is128bit = LT.second.is128BitVector();
4534
4535 InstructionCost MulCost =
4536 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4537 Op1Info.getNoProps(), Op2Info.getNoProps());
4538 InstructionCost AddCost =
4539 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4540 Op1Info.getNoProps(), Op2Info.getNoProps());
4541 InstructionCost ShrCost =
4542 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4543 Op1Info.getNoProps(), Op2Info.getNoProps());
4544 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4545 (HasMULH ? 0 : ShrCost) + // UMULL shift
4546 AddCost * 2 + ShrCost;
4547 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4548 }
4549 }
4550
4551 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4552 // emitted by the backend even when those functions are not declared in the
4553 // module.
4554 if (!VT.isVector() && VT.getSizeInBits() > 64)
4555 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4556
4557 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4558 Opcode, Ty, CostKind, Op1Info, Op2Info);
4559 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4560 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4561 // SDIV/UDIV operations are lowered using SVE, so the cost can be
4562 // lower.
4563 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4564 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4565 static const CostTblEntry DivTbl[]{
4566 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4567 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4568 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4569 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4570 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4571 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4572
4573 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4574 if (nullptr != Entry)
4575 return Entry->Cost;
4576 }
4577 // For 8/16-bit elements, the cost is higher because the type
4578 // requires promotion and possibly splitting:
4579 if (LT.second.getScalarType() == MVT::i8)
4580 Cost *= 8;
4581 else if (LT.second.getScalarType() == MVT::i16)
4582 Cost *= 4;
4583 return Cost;
4584 } else {
4585 // If one of the operands is a uniform constant then the cost for each
4586 // element is Cost for insertion, extraction and division.
4587 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4588 // operation with scalar type
4589 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4590 (Op2Info.isConstant() && Op2Info.isUniform())) {
4591 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4592 InstructionCost DivCost = getArithmeticInstrCost(
4593 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4594 return (4 + DivCost) * VTy->getNumElements();
4595 }
4596 }
4597 // On AArch64, without SVE, vector divisions are expanded
4598 // into scalar divisions of each pair of elements.
4599 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4600 -1, nullptr, nullptr);
4601 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4602 nullptr, nullptr);
4603 }
4604
4605 // TODO: if one of the arguments is scalar, then it's not necessary to
4606 // double the cost of handling the vector elements.
4607 Cost += Cost;
4608 }
4609 return Cost;
4610 }
4611 case ISD::MUL:
4612 // When SVE is available, we can lower the v2i64 operation using
4613 // the SVE mul instruction, which has a lower cost.
4614 if (LT.second == MVT::v2i64 && ST->hasSVE())
4615 return LT.first;
4616
4617 // When SVE is not available, there is no MUL.2d instruction,
4618 // which means mul <2 x i64> is expensive as elements are extracted
4619 // from the vectors and the muls scalarized.
4620 // As getScalarizationOverhead is a bit too pessimistic, we
4621 // estimate the cost for a i64 vector directly here, which is:
4622 // - four 2-cost i64 extracts,
4623 // - two 2-cost i64 inserts, and
4624 // - two 1-cost muls.
4625 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4626 // LT.first = 2 the cost is 28.
4627 if (LT.second != MVT::v2i64)
4628 return LT.first;
4629 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4630 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4631 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4632 nullptr, nullptr) *
4633 2 +
4634 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4635 nullptr, nullptr));
4636 case ISD::ADD:
4637 case ISD::XOR:
4638 case ISD::OR:
4639 case ISD::AND:
4640 case ISD::SRL:
4641 case ISD::SRA:
4642 case ISD::SHL:
4643 // These nodes are marked as 'custom' for combining purposes only.
4644 // We know that they are legal. See LowerAdd in ISelLowering.
4645 return LT.first;
4646
4647 case ISD::FNEG:
4648 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
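// e.g. 'fneg (fmul x, y)' or 'fmul (fneg x), y' can be selected as a single
// fnmul, so the fneg itself is treated as free below.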
4649 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4650 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4651 CxtI &&
4652 ((CxtI->hasOneUse() &&
4653 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4654 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4655 return 0;
4656 [[fallthrough]];
4657 case ISD::FADD:
4658 case ISD::FSUB:
4659 if (!Ty->getScalarType()->isFP128Ty())
4660 return LT.first;
4661 [[fallthrough]];
4662 case ISD::FMUL:
4663 case ISD::FDIV:
4664 // These nodes are marked as 'custom' just to lower them to SVE.
4665 // We know said lowering will incur no additional cost.
4666 if (!Ty->getScalarType()->isFP128Ty())
4667 return 2 * LT.first;
4668
4669 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4670 Op2Info);
4671 case ISD::FREM:
4672 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4673 // those functions are not declared in the module.
4674 if (!Ty->isVectorTy())
4675 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4676 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4677 Op2Info);
4678 }
4679}
4680
4681 InstructionCost
4682 AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4683 const SCEV *Ptr,
4684 TTI::TargetCostKind CostKind) const {
4685 // Address computations in vectorized code with non-consecutive addresses will
4686 // likely result in more instructions compared to scalar code where the
4687 // computation can more often be merged into the index mode. The resulting
4688 // extra micro-ops can significantly decrease throughput.
4689 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4690 int MaxMergeDistance = 64;
4691
4692 if (PtrTy->isVectorTy() && SE &&
4693 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4694 return NumVectorInstToHideOverhead;
4695
4696 // In many cases the address computation is not merged into the instruction
4697 // addressing mode.
4698 return 1;
4699}
4700
4701/// Check whether Opcode1 has less throughput according to the scheduling
4702/// model than Opcode2.
4703 bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4704 unsigned Opcode1, unsigned Opcode2) const {
4705 const MCSchedModel &Sched = ST->getSchedModel();
4706 const TargetInstrInfo *TII = ST->getInstrInfo();
4707 if (!Sched.hasInstrSchedModel())
4708 return false;
4709
4710 const MCSchedClassDesc *SCD1 =
4711 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4712 const MCSchedClassDesc *SCD2 =
4713 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4714 // We cannot handle variant scheduling classes without an MI. If we need to
4715 // support them for any of the instructions whose information we query, we
4716 // might need to add a way to resolve them without an MI, or stop using the
4717 // scheduling info.
4718 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4719 "Cannot handle variant scheduling classes without an MI");
4720 if (!SCD1->isValid() || !SCD2->isValid())
4721 return false;
4722
4723 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4724 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4725}
4726
4727 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4728 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4729 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4730 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4731 // We don't lower vector selects wider than the register width well.
4732 // TODO: Improve this with different cost kinds.
4733 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4734 // We would need this many instructions to hide the scalarization happening.
4735 const int AmortizationCost = 20;
4736
4737 // If VecPred is not set, check if we can get a predicate from the context
4738 // instruction, if its type matches the requested ValTy.
4739 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4740 CmpPredicate CurrentPred;
4741 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4742 m_Value())))
4743 VecPred = CurrentPred;
4744 }
4745 // Check if we have a compare/select chain that can be lowered using
4746 // a (F)CMxx & BFI pair.
4747 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4748 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4749 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4750 VecPred == CmpInst::FCMP_UNE) {
4751 static const auto ValidMinMaxTys = {
4752 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4753 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4754 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4755
4756 auto LT = getTypeLegalizationCost(ValTy);
4757 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4758 (ST->hasFullFP16() &&
4759 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4760 return LT.first;
4761 }
4762
4763 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4764 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4765 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4766 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4767 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4768 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4769 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4770 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4771 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4772 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4773 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4774 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4775
4776 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4777 EVT SelValTy = TLI->getValueType(DL, ValTy);
4778 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4779 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4780 SelCondTy.getSimpleVT(),
4781 SelValTy.getSimpleVT()))
4782 return Entry->Cost;
4783 }
4784 }
4785
4786 if (Opcode == Instruction::FCmp) {
4787 if (auto PromotedCost = getFP16BF16PromoteCost(
4788 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4789 // TODO: Consider costing SVE FCMPs.
4790 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4792 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4793 CostKind, Op1Info, Op2Info);
4794 if (isa<VectorType>(PromotedTy))
4796 Instruction::Trunc,
4800 return Cost;
4801 }))
4802 return *PromotedCost;
4803
4804 auto LT = getTypeLegalizationCost(ValTy);
4805 // Model unknown fp compares as a libcall.
4806 if (LT.second.getScalarType() != MVT::f64 &&
4807 LT.second.getScalarType() != MVT::f32 &&
4808 LT.second.getScalarType() != MVT::f16)
4809 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4810 {ValTy, ValTy}, CostKind);
4811
4812 // Some comparison operators require expanding to multiple compares + or.
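// e.g. a scalar FCMP_ONE is modelled with Factor = 2 (an fcmp with two
// selects), while a fixed-length vector FCMP_ONE typically becomes
// fcmgt + fcmgt with swapped operands + orr (Factor = 3).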
4813 unsigned Factor = 1;
4814 if (!CondTy->isVectorTy() &&
4815 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4816 Factor = 2; // fcmp with 2 selects
4817 else if (isa<FixedVectorType>(ValTy) &&
4818 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4819 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4820 Factor = 3; // fcmxx+fcmyy+or
4821 else if (isa<ScalableVectorType>(ValTy) &&
4822 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4823 Factor = 3; // fcmxx+fcmyy+or
4824
4825 if (isa<ScalableVectorType>(ValTy) &&
4827 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4828 AArch64::FCMEQv4f32))
4829 Factor *= 2;
4830
4831 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4832 }
4833
4834 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4835 // icmp(and, 0) as free, as we can make use of ands, but only if the
4836 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4837 // providing it will not cause performance regressions.
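// e.g. 'icmp eq (and %x, %y), 0' can be selected as a single ands/tst, so the
// compare itself is modelled as free below.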
4838 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4839 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4840 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4841 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4842 if (match(I->getOperand(1), m_Zero()))
4843 return 0;
4844
4845 // x >= 1 / x < 1 -> x > 0 / x <= 0
4846 if (match(I->getOperand(1), m_One()) &&
4847 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4848 return 0;
4849
4850 // x <= -1 / x > -1 -> x > 0 / x <= 0
4851 if (match(I->getOperand(1), m_AllOnes()) &&
4852 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4853 return 0;
4854 }
4855
4856 // The base case handles scalable vectors fine for now, since it treats the
4857 // cost as 1 * legalization cost.
4858 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4859 Op1Info, Op2Info, I);
4860}
4861
4862 TTI::MemCmpExpansionOptions
4863 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4864 TTI::MemCmpExpansionOptions Options;
4865 if (ST->requiresStrictAlign()) {
4866 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4867 // a bunch of instructions when strict align is enabled.
4868 return Options;
4869 }
4870 Options.AllowOverlappingLoads = true;
4871 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4872 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4873 // TODO: Though vector loads usually perform well on AArch64, on some targets
4874 // they may wake up the FP unit, which raises the power consumption. Perhaps
4875 // they could be used with no holds barred (-O3).
4876 Options.LoadSizes = {8, 4, 2, 1};
4877 Options.AllowedTailExpansions = {3, 5, 6};
4878 return Options;
4879}
4880
4882 return ST->hasSVE();
4883}
4884
4888 switch (MICA.getID()) {
4889 case Intrinsic::masked_scatter:
4890 case Intrinsic::masked_gather:
4891 return getGatherScatterOpCost(MICA, CostKind);
4892 case Intrinsic::masked_load:
4893 case Intrinsic::masked_expandload:
4894 case Intrinsic::masked_store:
4895 return getMaskedMemoryOpCost(MICA, CostKind);
4896 }
4898}
4899
4903 Type *Src = MICA.getDataType();
4904
4905 if (useNeonVector(Src))
4907 auto LT = getTypeLegalizationCost(Src);
4908 if (!LT.first.isValid())
4909 return InstructionCost::getInvalid();
4910
4911 // Return an invalid cost for element types that we are unable to lower.
4912 auto *VT = cast<VectorType>(Src);
4913 if (VT->getElementType()->isIntegerTy(1))
4914 return InstructionCost::getInvalid();
4915
4916 // The code-generator is currently not able to handle scalable vectors
4917 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4918 // it. This change will be removed when code-generation for these types is
4919 // sufficiently reliable.
4920 if (VT->getElementCount() == ElementCount::getScalable(1))
4921 return InstructionCost::getInvalid();
4922
4923 InstructionCost MemOpCost = LT.first;
4924 if (MICA.getID() == Intrinsic::masked_expandload) {
4925 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
4926 return InstructionCost::getInvalid();
4927
4928 // Operation will be split into expand of masked.load
4929 MemOpCost *= 2;
4930 }
4931
4932 // If we need to split the memory operation, we will also need to split the
4933 // mask. This will likely lead to overestimating the cost in some cases if
4934 // multiple memory operations use the same mask, but we often don't have
4935 // enough context to figure that out here.
4936 //
4937 // If the elements being loaded are bytes then the mask will already be split,
4938 // since the number of bits in a P register matches the number of bytes in a
4939 // Z register.
4940 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
4941 return MemOpCost * 2;
4942
4943 return MemOpCost;
4944}
4945
4946// This function returns gather/scatter overhead either from
4947// user-provided value or specialized values per-target from \p ST.
4948static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4949 const AArch64Subtarget *ST) {
4950 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4951 "Should be called on only load or stores.");
4952 switch (Opcode) {
4953 case Instruction::Load:
4954 if (SVEGatherOverhead.getNumOccurrences() > 0)
4955 return SVEGatherOverhead;
4956 return ST->getGatherOverhead();
4957 break;
4958 case Instruction::Store:
4959 if (SVEScatterOverhead.getNumOccurrences() > 0)
4960 return SVEScatterOverhead;
4961 return ST->getScatterOverhead();
4962 break;
4963 default:
4964 llvm_unreachable("Shouldn't have reached here");
4965 }
4966}
4967
4971
4972 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4973 MICA.getID() == Intrinsic::vp_gather)
4974 ? Instruction::Load
4975 : Instruction::Store;
4976
4977 Type *DataTy = MICA.getDataType();
4978 Align Alignment = MICA.getAlignment();
4979 const Instruction *I = MICA.getInst();
4980
4981 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4983 auto *VT = cast<VectorType>(DataTy);
4984 auto LT = getTypeLegalizationCost(DataTy);
4985 if (!LT.first.isValid())
4986 return InstructionCost::getInvalid();
4987
4988 // Return an invalid cost for element types that we are unable to lower.
4989 if (!LT.second.isVector() ||
4990 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4991 VT->getElementType()->isIntegerTy(1))
4992 return InstructionCost::getInvalid();
4993
4994 // The code-generator is currently not able to handle scalable vectors
4995 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4996 // it. This change will be removed when code-generation for these types is
4997 // sufficiently reliable.
4998 if (VT->getElementCount() == ElementCount::getScalable(1))
4999 return InstructionCost::getInvalid();
5000
5001 ElementCount LegalVF = LT.second.getVectorElementCount();
5002 InstructionCost MemOpCost =
5003 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5004 {TTI::OK_AnyValue, TTI::OP_None}, I);
5005 // Add on an overhead cost for using gathers/scatters.
5006 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
5007 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5008}
5009
5011 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5012}
5013
5015 Align Alignment,
5016 unsigned AddressSpace,
5018 TTI::OperandValueInfo OpInfo,
5019 const Instruction *I) const {
5020 EVT VT = TLI->getValueType(DL, Ty, true);
5021 // Type legalization can't handle structs
5022 if (VT == MVT::Other)
5023 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5024 CostKind);
5025
5026 auto LT = getTypeLegalizationCost(Ty);
5027 if (!LT.first.isValid())
5028 return InstructionCost::getInvalid();
5029
5030 // The code-generator is currently not able to handle scalable vectors
5031 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5032 // it. This change will be removed when code-generation for these types is
5033 // sufficiently reliable.
5034 // We also only support full register predicate loads and stores.
5035 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5036 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5037 (VTy->getElementType()->isIntegerTy(1) &&
5038 !VTy->getElementCount().isKnownMultipleOf(
5041
5042 // TODO: consider latency as well for TCK_SizeAndLatency.
5043 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
5044 return LT.first;
5045
5046 if (CostKind != TTI::TCK_RecipThroughput)
5047 return 1;
5048
5049 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5050 LT.second.is128BitVector() && Alignment < Align(16)) {
5051 // Unaligned stores are extremely inefficient. We don't split all
5052 // unaligned 128-bit stores because of the negative impact that has been
5053 // shown in practice on inlined block copy code.
5054 // We make such stores expensive so that we will only vectorize if there
5055 // are 6 other instructions getting vectorized.
5056 const int AmortizationCost = 6;
5057
5058 return LT.first * 2 * AmortizationCost;
5059 }
5060
5061 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5062 if (Ty->isPtrOrPtrVectorTy())
5063 return LT.first;
5064
5065 if (useNeonVector(Ty)) {
5066 // Check truncating stores and extending loads.
5067 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5068 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
5069 if (VT == MVT::v4i8)
5070 return 2;
5071 // Otherwise we need to scalarize.
5072 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5073 }
5074 EVT EltVT = VT.getVectorElementType();
5075 unsigned EltSize = EltVT.getScalarSizeInBits();
5076 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5077 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5078 return LT.first;
5079 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5080 // widening to v4i8, which produces suboptimal results.
5081 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5082 return LT.first;
5083
5084 // Check non-power-of-2 loads/stores for legal vector element types with
5085 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5086 // operations on smaller power-of-2 ops, including ld1/st1.
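// e.g. a v3i32 access is split by the worklist below into a v2i32 and a
// v1i32 operation, giving a cost of 2.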
5087 LLVMContext &C = Ty->getContext();
5089 SmallVector<EVT> TypeWorklist;
5090 TypeWorklist.push_back(VT);
5091 while (!TypeWorklist.empty()) {
5092 EVT CurrVT = TypeWorklist.pop_back_val();
5093 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5094 if (isPowerOf2_32(CurrNumElements)) {
5095 Cost += 1;
5096 continue;
5097 }
5098
5099 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5100 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5101 TypeWorklist.push_back(
5102 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5103 }
5104 return Cost;
5105 }
5106
5107 return LT.first;
5108}
5109
5111 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5112 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5113 bool UseMaskForCond, bool UseMaskForGaps) const {
5114 assert(Factor >= 2 && "Invalid interleave factor");
5115 auto *VecVTy = cast<VectorType>(VecTy);
5116
5117 if (VecTy->isScalableTy() && !ST->hasSVE())
5118 return InstructionCost::getInvalid();
5119
5120 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5121 // only have lowering for power-of-2 factors.
5122 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5123 // InterleavedAccessPass for ld3/st3
5124 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5125 return InstructionCost::getInvalid();
5126
5127 // Vectorization for masked interleaved accesses is only enabled for scalable
5128 // VF.
5129 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5130 return InstructionCost::getInvalid();
5131
5132 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5133 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5134 auto *SubVecTy =
5135 VectorType::get(VecVTy->getElementType(),
5136 VecVTy->getElementCount().divideCoefficientBy(Factor));
5137
5138 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5139 // Accesses having vector types that are a multiple of 128 bits can be
5140 // matched to more than one ldN/stN instruction.
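// e.g. an interleaved group of factor 2 over <16 x i8> uses <8 x i8>
// sub-vectors, a legal 64-bit type, so the cost is roughly
// Factor * the number of ldN/stN accesses needed (here 2 * 1).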
5141 bool UseScalable;
5142 if (MinElts % Factor == 0 &&
5143 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5144 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5145 }
5146
5147 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5148 Alignment, AddressSpace, CostKind,
5149 UseMaskForCond, UseMaskForGaps);
5150}
5151
5156 for (auto *I : Tys) {
5157 if (!I->isVectorTy())
5158 continue;
5159 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5160 128)
5161 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5162 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5163 }
5164 return Cost;
5165}
5166
5167 bool AArch64TTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
5168 Align Alignment) const {
5169 // Neon types should be scalarised when we are not choosing to use SVE.
5170 if (useNeonVector(DataTy))
5171 return false;
5172
5173 // Return true only if we are able to lower using the SVE2p2/SME2p2
5174 // expand instruction.
5175 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5176 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5177}
5178
5179 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
5180 return ST->getMaxInterleaveFactor();
5181}
5182
5183// For Falkor, we want to avoid having too many strided loads in a loop since
5184// that can exhaust the HW prefetcher resources. We adjust the unroller
5185// MaxCount preference below to attempt to ensure unrolling doesn't create too
5186// many strided loads.
5187static void
5188 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5189 TargetTransformInfo::UnrollingPreferences &UP) {
5190 enum { MaxStridedLoads = 7 };
5191 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5192 int StridedLoads = 0;
5193 // FIXME? We could make this more precise by looking at the CFG and
5194 // e.g. not counting loads in each side of an if-then-else diamond.
5195 for (const auto BB : L->blocks()) {
5196 for (auto &I : *BB) {
5197 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5198 if (!LMemI)
5199 continue;
5200
5201 Value *PtrValue = LMemI->getPointerOperand();
5202 if (L->isLoopInvariant(PtrValue))
5203 continue;
5204
5205 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5206 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5207 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5208 continue;
5209
5210 // FIXME? We could take pairing of unrolled load copies into account
5211 // by looking at the AddRec, but we would probably have to limit this
5212 // to loops with no stores or other memory optimization barriers.
5213 ++StridedLoads;
5214 // We've seen enough strided loads that seeing more won't make a
5215 // difference.
5216 if (StridedLoads > MaxStridedLoads / 2)
5217 return StridedLoads;
5218 }
5219 }
5220 return StridedLoads;
5221 };
5222
5223 int StridedLoads = countStridedLoads(L, SE);
5224 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5225 << " strided loads\n");
5226 // Pick the largest power of 2 unroll count that won't result in too many
5227 // strided loads.
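// e.g. with 3 strided loads detected and MaxStridedLoads == 7, the unroll
// count is capped at 1 << Log2_32(7 / 3) == 2.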
5228 if (StridedLoads) {
5229 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5230 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5231 << UP.MaxCount << '\n');
5232 }
5233}
5234
5235// This function returns true if the loop:
5236// 1. Has a valid cost, and
5237// 2. Has a cost within the supplied budget.
5238// Otherwise it returns false.
5239 static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5240 InstructionCost Budget,
5241 unsigned *FinalSize) {
5242 // Estimate the size of the loop.
5243 InstructionCost LoopCost = 0;
5244
5245 for (auto *BB : L->getBlocks()) {
5246 for (auto &I : *BB) {
5247 SmallVector<const Value *, 4> Operands(I.operand_values());
5248 InstructionCost Cost =
5249 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5250 // This can happen with intrinsics that don't currently have a cost model
5251 // or for some operations that require SVE.
5252 if (!Cost.isValid())
5253 return false;
5254
5255 LoopCost += Cost;
5256 if (LoopCost > Budget)
5257 return false;
5258 }
5259 }
5260
5261 if (FinalSize)
5262 *FinalSize = LoopCost.getValue();
5263 return true;
5264}
5265
5266 static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5267 const AArch64TTIImpl &TTI) {
5268 // Only consider loops with unknown trip counts for which we can determine
5269 // a symbolic expression. Multi-exit loops with small known trip counts will
5270 // likely be unrolled anyway.
5271 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5272 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5273 return false;
5274
5275 // It might not be worth unrolling loops with low max trip counts. Restrict
5276 // this to max trip counts > 32 for now.
5277 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5278 if (MaxTC > 0 && MaxTC <= 32)
5279 return false;
5280
5281 // Make sure the loop size is <= 5.
5282 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5283 return false;
5284
5285 // Small search loops with multiple exits can be highly beneficial to unroll.
5286 // We only care about loops with exactly two exiting blocks, although each
5287 // block could jump to the same exit block.
5288 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5289 if (Blocks.size() != 2)
5290 return false;
5291
5292 if (any_of(Blocks, [](BasicBlock *BB) {
5294 }))
5295 return false;
5296
5297 return true;
5298}
5299
5300 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5301/// OOO engine's wide instruction window and various predictors.
5302static void
5305 const AArch64TTIImpl &TTI) {
5306 // Limit loops with structure that is highly likely to benefit from runtime
5307 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5308 // likely with complex control flow). Note that the heuristics here may be
5309 // overly conservative and we err on the side of avoiding runtime unrolling
5310 // rather than unroll excessively. They are all subject to further refinement.
5311 if (!L->isInnermost() || L->getNumBlocks() > 8)
5312 return;
5313
5314 // Loops with multiple exits are handled by common code.
5315 if (!L->getExitBlock())
5316 return;
5317
5318 // Check if the loop contains any reductions that could be parallelized when
5319 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5320 // a multiple of 2.
5321 bool HasParellelizableReductions =
5322 L->getNumBlocks() == 1 &&
5323 any_of(L->getHeader()->phis(),
5324 [&SE, L](PHINode &Phi) {
5325 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5326 }) &&
5327 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5328 if (HasParellelizableReductions &&
5329 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5330 UP.Partial = true;
5331 UP.MaxCount = 4;
5332 UP.AddAdditionalAccumulators = true;
5333 }
5334
5335 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5336 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5337 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5338 SE.getSmallConstantMaxTripCount(L) <= 32))
5339 return;
5340
5341 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5342 return;
5343
5345 return;
5346
5347 // Limit to loops with trip counts that are cheap to expand.
5348 UP.SCEVExpansionBudget = 1;
5349
5350 if (HasParellelizableReductions) {
5351 UP.Runtime = true;
5353 UP.AddAdditionalAccumulators = true;
5354 }
5355
5356 // Try to unroll small loops of a few blocks with a low budget, if they have
5357 // load/store dependencies (to expose more parallel memory-access streams),
5358 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5359 BasicBlock *Header = L->getHeader();
5360 BasicBlock *Latch = L->getLoopLatch();
5361 if (Header == Latch) {
5362 // Estimate the size of the loop.
5363 unsigned Size;
5364 unsigned Width = 10;
5365 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5366 return;
5367
5368 // Try to find an unroll count that maximizes the use of the instruction
5369 // window, i.e. trying to fetch as many instructions per cycle as possible.
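// e.g. for a body of about 12 instructions the search below settles on an
// unroll count of 4 (48 instructions), since that fills whole 16-instruction
// fetch groups while staying within the 48-instruction cap.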
5370 unsigned MaxInstsPerLine = 16;
5371 unsigned UC = 1;
5372 unsigned BestUC = 1;
5373 unsigned SizeWithBestUC = BestUC * Size;
5374 while (UC <= 8) {
5375 unsigned SizeWithUC = UC * Size;
5376 if (SizeWithUC > 48)
5377 break;
5378 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5379 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5380 BestUC = UC;
5381 SizeWithBestUC = BestUC * Size;
5382 }
5383 UC++;
5384 }
5385
5386 if (BestUC == 1)
5387 return;
5388
5389 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5390 SmallVector<StoreInst *> Stores;
5391 for (auto *BB : L->blocks()) {
5392 for (auto &I : *BB) {
5393 Value *Ptr = getLoadStorePointerOperand(&I);
5394 if (!Ptr)
5395 continue;
5396 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5397 if (SE.isLoopInvariant(PtrSCEV, L))
5398 continue;
5399 if (isa<LoadInst>(&I)) {
5400 LoadedValuesPlus.insert(&I);
5401 // Include in-loop 1st users of loaded values.
5402 for (auto *U : I.users())
5403 if (L->contains(cast<Instruction>(U)))
5404 LoadedValuesPlus.insert(U);
5405 } else
5406 Stores.push_back(cast<StoreInst>(&I));
5407 }
5408 }
5409
5410 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5411 return LoadedValuesPlus.contains(SI->getOperand(0));
5412 }))
5413 return;
5414
5415 UP.Runtime = true;
5416 UP.DefaultUnrollRuntimeCount = BestUC;
5417 return;
5418 }
5419
5420 // Try to runtime-unroll loops with early-continues depending on loop-varying
5421 // loads; this helps with branch-prediction for the early-continues.
5422 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5423 SmallVector<BasicBlock *> Preds(predecessors(Latch));
5424 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5425 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5426 return;
5427
5428 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5429 [&](Instruction *I, unsigned Depth) -> bool {
5430 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5431 return false;
5432
5433 if (isa<LoadInst>(I))
5434 return true;
5435
5436 return any_of(I->operands(), [&](Value *V) {
5437 auto *I = dyn_cast<Instruction>(V);
5438 return I && DependsOnLoopLoad(I, Depth + 1);
5439 });
5440 };
5441 CmpPredicate Pred;
5442 Instruction *I;
5443 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5444 m_Value())) &&
5445 DependsOnLoopLoad(I, 0)) {
5446 UP.Runtime = true;
5447 }
5448}
5449
5452 OptimizationRemarkEmitter *ORE) const {
5453 // Enable partial unrolling and runtime unrolling.
5454 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5455
5456 UP.UpperBound = true;
5457
5458 // An inner loop is more likely to be hot, and the runtime check can be
5459 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5460 // threshold to unroll more loops.
5461 if (L->getLoopDepth() > 1)
5462 UP.PartialThreshold *= 2;
5463
5464 // Disable partial & runtime unrolling on -Os.
5465 UP.PartialOptSizeThreshold = 0;
5466
5467 // Scan the loop: don't unroll loops with calls as this could prevent
5468 // inlining. Don't unroll auto-vectorized loops either, though do allow
5469 // unrolling of the scalar remainder.
5470 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5471 InstructionCost Cost = 0;
5472 for (auto *BB : L->getBlocks()) {
5473 for (auto &I : *BB) {
5474 // Both auto-vectorized loops and the scalar remainder have the
5475 // isvectorized attribute, so differentiate between them by the presence
5476 // of vector instructions.
5477 if (IsVectorized && I.getType()->isVectorTy())
5478 return;
5479 if (isa<CallBase>(I)) {
5480 if (const Function *F =
5481 cast<CallBase>(I).getCalledFunction())
5482 if (!isLoweredToCall(F))
5483 continue;
5484 return;
5485 }
5486
5487 SmallVector<const Value *, 4> Operands(I.operand_values());
5488 Cost += getInstructionCost(&I, Operands,
5489 TTI::TCK_SizeAndLatency);
5490 }
5491 }
5492
5493 // Apply subtarget-specific unrolling preferences.
5494 if (ST->isAppleMLike())
5495 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5496 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5497 EnableFalkorHWPFUnrollFix)
5498 getFalkorUnrollingPreferences(L, SE, UP);
5499
5500 // If this is a small, multi-exit loop similar to something like std::find,
5501 // then there is typically a performance improvement achieved by unrolling.
5502 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5503 UP.RuntimeUnrollMultiExit = true;
5504 UP.Runtime = true;
5505 // Limit unroll count.
5506 UP.DefaultUnrollRuntimeCount = 4;
5507 // Allow slightly more costly trip-count expansion to catch search loops
5508 // with pointer inductions.
5509 UP.SCEVExpansionBudget = 5;
5510 return;
5511 }
5512
5513 // Enable runtime unrolling for in-order models.
5514 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5515 // by checking for that case, we can ensure that the default behaviour is
5516 // unchanged.
5517 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5518 !ST->getSchedModel().isOutOfOrder()) {
5519 UP.Runtime = true;
5520 UP.Partial = true;
5521 UP.UnrollRemainder = true;
5522 UP.DefaultUnrollRuntimeCount = 4;
5523
5524 UP.UnrollAndJam = true;
5525 UP.UnrollAndJamInnerLoopThreshold = 60;
5526 }
5527
5528 // Force-unrolling small loops can be very useful because of the
5529 // branch-taken cost of the backedge.
5531 UP.Force = true;
5532}
5533
5538
5539 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5540 Type *ExpectedType,
5541 bool CanCreate) const {
5542 switch (Inst->getIntrinsicID()) {
5543 default:
5544 return nullptr;
5545 case Intrinsic::aarch64_neon_st2:
5546 case Intrinsic::aarch64_neon_st3:
5547 case Intrinsic::aarch64_neon_st4: {
5548 // Create a struct type
5549 StructType *ST = dyn_cast<StructType>(ExpectedType);
5550 if (!CanCreate || !ST)
5551 return nullptr;
5552 unsigned NumElts = Inst->arg_size() - 1;
5553 if (ST->getNumElements() != NumElts)
5554 return nullptr;
5555 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5556 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5557 return nullptr;
5558 }
5559 Value *Res = PoisonValue::get(ExpectedType);
5560 IRBuilder<> Builder(Inst);
5561 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5562 Value *L = Inst->getArgOperand(i);
5563 Res = Builder.CreateInsertValue(Res, L, i);
5564 }
5565 return Res;
5566 }
5567 case Intrinsic::aarch64_neon_ld2:
5568 case Intrinsic::aarch64_neon_ld3:
5569 case Intrinsic::aarch64_neon_ld4:
5570 if (Inst->getType() == ExpectedType)
5571 return Inst;
5572 return nullptr;
5573 }
5574}
5575
5576 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5577 MemIntrinsicInfo &Info) const {
5578 switch (Inst->getIntrinsicID()) {
5579 default:
5580 break;
5581 case Intrinsic::aarch64_neon_ld2:
5582 case Intrinsic::aarch64_neon_ld3:
5583 case Intrinsic::aarch64_neon_ld4:
5584 Info.ReadMem = true;
5585 Info.WriteMem = false;
5586 Info.PtrVal = Inst->getArgOperand(0);
5587 break;
5588 case Intrinsic::aarch64_neon_st2:
5589 case Intrinsic::aarch64_neon_st3:
5590 case Intrinsic::aarch64_neon_st4:
5591 Info.ReadMem = false;
5592 Info.WriteMem = true;
5593 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5594 break;
5595 }
5596
5597 switch (Inst->getIntrinsicID()) {
5598 default:
5599 return false;
5600 case Intrinsic::aarch64_neon_ld2:
5601 case Intrinsic::aarch64_neon_st2:
5602 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5603 break;
5604 case Intrinsic::aarch64_neon_ld3:
5605 case Intrinsic::aarch64_neon_st3:
5606 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5607 break;
5608 case Intrinsic::aarch64_neon_ld4:
5609 case Intrinsic::aarch64_neon_st4:
5610 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5611 break;
5612 }
5613 return true;
5614}
5615
5616 /// See if \p I should be considered for address type promotion. We check if
5617 /// \p I is a sext with the right type and used in memory accesses. If it is
5618 /// used in a "complex" getelementptr, we allow it to be promoted without
5619 /// finding other sext instructions that sign extended the same initial value.
5620 /// A getelementptr is considered "complex" if it has more than 2 operands.
5622 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5623 bool Considerable = false;
5624 AllowPromotionWithoutCommonHeader = false;
5625 if (!isa<SExtInst>(&I))
5626 return false;
5627 Type *ConsideredSExtType =
5628 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5629 if (I.getType() != ConsideredSExtType)
5630 return false;
5631 // See if the sext is the one with the right type and used in at least one
5632 // GetElementPtrInst.
5633 for (const User *U : I.users()) {
5634 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5635 Considerable = true;
5636 // A getelementptr is considered as "complex" if it has more than 2
5637 // operands. We will promote a SExt used in such complex GEP as we
5638 // expect some computation to be merged if they are done on 64 bits.
5639 if (GEPInst->getNumOperands() > 2) {
5640 AllowPromotionWithoutCommonHeader = true;
5641 break;
5642 }
5643 }
5644 }
5645 return Considerable;
5646}
5647
5648 bool AArch64TTIImpl::isLegalToVectorizeReduction(
5649 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5650 if (!VF.isScalable())
5651 return true;
5652
5653 Type *Ty = RdxDesc.getRecurrenceType();
5654 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5655 return false;
5656
5657 switch (RdxDesc.getRecurrenceKind()) {
5658 case RecurKind::Sub:
5660 case RecurKind::Add:
5661 case RecurKind::FAdd:
5662 case RecurKind::And:
5663 case RecurKind::Or:
5664 case RecurKind::Xor:
5665 case RecurKind::SMin:
5666 case RecurKind::SMax:
5667 case RecurKind::UMin:
5668 case RecurKind::UMax:
5669 case RecurKind::FMin:
5670 case RecurKind::FMax:
5671 case RecurKind::FMulAdd:
5672 case RecurKind::AnyOf:
5674 return true;
5675 default:
5676 return false;
5677 }
5678}
5679
5680 InstructionCost
5681 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5682 FastMathFlags FMF,
5683 TTI::TargetCostKind CostKind) const {
5684 // The code-generator is currently not able to handle scalable vectors
5685 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5686 // it. This change will be removed when code-generation for these types is
5687 // sufficiently reliable.
5688 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5689 if (VTy->getElementCount() == ElementCount::getScalable(1))
5690 return InstructionCost::getInvalid();
5691
5692 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5693
5694 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5695 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5696
5697 InstructionCost LegalizationCost = 0;
5698 if (LT.first > 1) {
5699 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5700 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5701 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5702 }
5703
5704 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5705}
5706
5707 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5708 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5709 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5710 InstructionCost LegalizationCost = 0;
5711 if (LT.first > 1) {
5712 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5713 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5714 LegalizationCost *= LT.first - 1;
5715 }
5716
5717 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5718 assert(ISD && "Invalid opcode");
5719 // Add the final reduction cost for the legal horizontal reduction
5720 switch (ISD) {
5721 case ISD::ADD:
5722 case ISD::AND:
5723 case ISD::OR:
5724 case ISD::XOR:
5725 case ISD::FADD:
5726 return LegalizationCost + 2;
5727 default:
5728 return InstructionCost::getInvalid();
5729 }
5730}
5731
5732 InstructionCost
5733 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5734 std::optional<FastMathFlags> FMF,
5735 TTI::TargetCostKind CostKind) const {
5736 // The code-generator is currently not able to handle scalable vectors
5737 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5738 // it. This change will be removed when code-generation for these types is
5739 // sufficiently reliable.
5740 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5741 if (VTy->getElementCount() == ElementCount::getScalable(1))
5742 return InstructionCost::getInvalid();
5743
5745 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5746 InstructionCost BaseCost =
5747 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5748 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5749 // end up vectorizing for more computationally intensive loops.
5750 return BaseCost + FixedVTy->getNumElements();
5751 }
5752
5753 if (Opcode != Instruction::FAdd)
5754 return InstructionCost::getInvalid();
5755
5756 auto *VTy = cast<ScalableVectorType>(ValTy);
5757 InstructionCost Cost =
5758 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5759 Cost *= getMaxNumElements(VTy->getElementCount());
5760 return Cost;
5761 }
5762
5763 if (isa<ScalableVectorType>(ValTy))
5764 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5765
5766 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5767 MVT MTy = LT.second;
5768 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5769 assert(ISD && "Invalid opcode");
5770
5771 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5772 // instructions as twice a normal vector add, plus 1 for each legalization
5773 // step (LT.first). This is the only arithmetic vector reduction operation for
5774 // which we have an instruction.
5775 // OR, XOR and AND costs should match the codegen from:
5776 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5777 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5778 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
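// e.g. a v4i32 add reduction maps to a single addv, which the ADD entries
// below model as cost 2, plus (LT.first - 1) for any legalization splits.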
5779 static const CostTblEntry CostTblNoPairwise[]{
5780 {ISD::ADD, MVT::v8i8, 2},
5781 {ISD::ADD, MVT::v16i8, 2},
5782 {ISD::ADD, MVT::v4i16, 2},
5783 {ISD::ADD, MVT::v8i16, 2},
5784 {ISD::ADD, MVT::v2i32, 2},
5785 {ISD::ADD, MVT::v4i32, 2},
5786 {ISD::ADD, MVT::v2i64, 2},
5787 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5788 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5789 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5790 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5791 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5792 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5793 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5794 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5795 {ISD::XOR, MVT::v16i8, 7},
5796 {ISD::XOR, MVT::v4i16, 4},
5797 {ISD::XOR, MVT::v8i16, 6},
5798 {ISD::XOR, MVT::v2i32, 3},
5799 {ISD::XOR, MVT::v4i32, 5},
5800 {ISD::XOR, MVT::v2i64, 3},
5801 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5802 {ISD::AND, MVT::v16i8, 7},
5803 {ISD::AND, MVT::v4i16, 4},
5804 {ISD::AND, MVT::v8i16, 6},
5805 {ISD::AND, MVT::v2i32, 3},
5806 {ISD::AND, MVT::v4i32, 5},
5807 {ISD::AND, MVT::v2i64, 3},
5808 };
5809 switch (ISD) {
5810 default:
5811 break;
5812 case ISD::FADD:
5813 if (Type *EltTy = ValTy->getScalarType();
5814 // FIXME: For half types without fullfp16 support, this could extend and
5815 // use a fp32 faddp reduction but current codegen unrolls.
5816 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5817 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5818 const unsigned NElts = MTy.getVectorNumElements();
5819 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5820 isPowerOf2_32(NElts))
5821 // Reduction corresponding to series of fadd instructions is lowered to
5822 // series of faddp instructions. faddp has latency/throughput that
5823 // matches fadd instruction and hence, every faddp instruction can be
5824 // considered to have a relative cost = 1 with
5825 // CostKind = TCK_RecipThroughput.
5826 // An faddp will pairwise add vector elements, so the size of input
5827 // vector reduces by half every time, requiring
5828 // #(faddp instructions) = log2_32(NElts).
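// e.g. a v4f32 fadd reduction lowers to two faddp instructions, so the
// cost returned below is (LT.first - 1) + 2.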
5829 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5830 }
5831 break;
5832 case ISD::ADD:
5833 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5834 return (LT.first - 1) + Entry->Cost;
5835 break;
5836 case ISD::XOR:
5837 case ISD::AND:
5838 case ISD::OR:
5839 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5840 if (!Entry)
5841 break;
5842 auto *ValVTy = cast<FixedVectorType>(ValTy);
5843 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5844 isPowerOf2_32(ValVTy->getNumElements())) {
5845 InstructionCost ExtraCost = 0;
5846 if (LT.first != 1) {
5847 // Type needs to be split, so there is an extra cost of LT.first - 1
5848 // arithmetic ops.
5849 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5850 MTy.getVectorNumElements());
5851 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5852 ExtraCost *= LT.first - 1;
5853 }
5854 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5855 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5856 return Cost + ExtraCost;
5857 }
5858 break;
5859 }
5860 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5861}
5862
5864 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5865 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5866 EVT VecVT = TLI->getValueType(DL, VecTy);
5867 EVT ResVT = TLI->getValueType(DL, ResTy);
5868
5869 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5870 VecVT.getSizeInBits() >= 64) {
5871 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5872
5873 // The legal cases are:
5874 // UADDLV 8/16/32->32
5875 // UADDLP 32->64
5876 unsigned RevVTSize = ResVT.getSizeInBits();
5877 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5878 RevVTSize <= 32) ||
5879 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5880 RevVTSize <= 32) ||
5881 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5882 RevVTSize <= 64))
5883 return (LT.first - 1) * 2 + 2;
5884 }
5885
5886 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5887 CostKind);
5888}
5889
5891AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5892 Type *ResTy, VectorType *VecTy,
5894 EVT VecVT = TLI->getValueType(DL, VecTy);
5895 EVT ResVT = TLI->getValueType(DL, ResTy);
5896
5897 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5898 RedOpcode == Instruction::Add) {
5899 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5900
5901 // The legal cases with dotprod are
5902 // UDOT 8->32
5903 // Which requires an additional uaddv to sum the i32 values.
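// e.g. a v16i8 multiply-accumulate reduction into i32 can use udot/sdot plus
// a final addv, modelled below as LT.first + 2.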
5904 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5905 ResVT == MVT::i32)
5906 return LT.first + 2;
5907 }
5908
5909 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5910 CostKind);
5911}
5912
5913 InstructionCost
5914 AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5915 TTI::TargetCostKind CostKind) const {
5916 static const CostTblEntry ShuffleTbl[] = {
5917 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5918 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5919 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5920 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5921 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5922 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5923 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5924 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5925 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5926 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5927 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5928 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5929 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5930 };
5931
5932 // The code-generator is currently not able to handle scalable vectors
5933 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5934 // it. This change will be removed when code-generation for these types is
5935 // sufficiently reliable.
5936 if (Tp->getElementCount() == ElementCount::getScalable(1))
5937 return InstructionCost::getInvalid();
5938
5939 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5940 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5941 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5942 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5943 : LT.second;
5944 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5945 InstructionCost LegalizationCost = 0;
5946 if (Index < 0) {
5947 LegalizationCost =
5948 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5949 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5950 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5951 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5952 }
5953
5954 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
5955 // The cost is computed on the promoted type.
5956 if (LT.second.getScalarType() == MVT::i1) {
5957 LegalizationCost +=
5958 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5959 TTI::CastContextHint::None, CostKind) +
5960 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5961 TTI::CastContextHint::None, CostKind);
5962 }
5963 const auto *Entry =
5964 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5965 assert(Entry && "Illegal Type for Splice");
5966 LegalizationCost += Entry->Cost;
5967 return LegalizationCost * LT.first;
5968}
5969
5970 InstructionCost AArch64TTIImpl::getPartialReductionCost(
5971 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5972 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5973 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5974 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
5975 InstructionCost Invalid = InstructionCost::getInvalid();
5976
5977 if (!ST->isSVEorStreamingSVEAvailable() && !ST->isNeonAvailable())
5978 return Invalid;
5979
5980 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5981 Opcode != Instruction::FAdd) ||
5982 OpAExtend == TTI::PR_None)
5983 return Invalid;
5984
5985 // Floating-point partial reductions are invalid if `reassoc` and `contract`
5986 // are not allowed.
5987 if (AccumType->isFloatingPointTy()) {
5988 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
5989 if (!FMF->allowReassoc() || !FMF->allowContract())
5990 return Invalid;
5991 } else {
5992 assert(!FMF &&
5993 "FastMathFlags only apply to floating-point partial reductions");
5994 }
5995
5996 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5997 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5998 "Unexpected values for OpBExtend or InputTypeB");
5999
6000 // We only support multiply binary operations for now, and for muls we
6001 // require the types being extended to be the same.
6002 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6003 InputTypeA != InputTypeB))
6004 return Invalid;
6005
6006 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6007 if (IsUSDot && !ST->hasMatMulInt8())
6008 // FIXME: Remove this early bailout in favour of expand cost.
6009 return Invalid;
6010
6011 unsigned Ratio =
6012 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6013 if (VF.getKnownMinValue() <= Ratio)
6014 return Invalid;
6015
6016 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6017 VectorType *AccumVectorType =
6018 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6019 // We don't yet support all kinds of legalization.
6020 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6021 EVT::getEVT(AccumVectorType));
6022 switch (TC.first) {
6023 default:
6024 return Invalid;
6025 case TargetLoweringBase::TypeLegal:
6026 case TargetLoweringBase::TypePromoteInteger:
6027 case TargetLoweringBase::TypeSplitVector:
6028 // The legalised type (e.g. after splitting) must be legal too.
6029 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6030 TargetLoweringBase::TypeLegal)
6031 return Invalid;
6032 break;
6033 }
6034
6035 std::pair<InstructionCost, MVT> AccumLT =
6036 getTypeLegalizationCost(AccumVectorType);
6037 std::pair<InstructionCost, MVT> InputLT =
6038 getTypeLegalizationCost(InputVectorType);
6039
6040 // Returns true if the subtarget supports the operation for a given type.
6041 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6042 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6043 (AccumLT.second.isFixedLengthVector() &&
6044 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6045 NEONPred);
6046 };
6047
6048 bool IsSub = Opcode == Instruction::Sub;
6049 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6050
6051 if (AccumLT.second.getScalarType() == MVT::i32 &&
6052 InputLT.second.getScalarType() == MVT::i8 && !IsSub) {
6053 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6054 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6055 return Cost;
6056 // i8 -> i32 usdot requires +i8mm
6057 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6058 return Cost;
6059 }
6060
6061 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot && !IsSub) {
6062 // i16 -> i64 is natively supported for udot/sdot
6063 if (AccumLT.second.getScalarType() == MVT::i64 &&
6064 InputLT.second.getScalarType() == MVT::i16)
6065 return Cost;
6066 // i16 -> i32 is natively supported with SVE2p1
6067 if (AccumLT.second.getScalarType() == MVT::i32 &&
6068 InputLT.second.getScalarType() == MVT::i16 &&
6069 (ST->hasSVE2p1() || ST->hasSME2()))
6070 return Cost;
6071 // i8 -> i64 is supported with an extra level of extends
6072 if (AccumLT.second.getScalarType() == MVT::i64 &&
6073 InputLT.second.getScalarType() == MVT::i8)
6074 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6075 // because it requires two extra extends on the inputs. But if we'd change
6076 // that now, a regular reduction would be cheaper because the costs of
6077 // the extends in the IR are still counted. This can be fixed
6078 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6079 return Cost;
6080 // i8 -> i16 is natively supported with SVE2p3
6081 if (AccumLT.second.getScalarType() == MVT::i16 &&
6082 InputLT.second.getScalarType() == MVT::i8 &&
6083 (ST->hasSVE2p3() || ST->hasSME2p3()))
6084 return Cost;
6085 }
6086
6087 // f16 -> f32 is natively supported for fdot using either
6088 // SVE or NEON instruction.
6089 if (Opcode == Instruction::FAdd && !IsSub &&
6090 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6091 AccumLT.second.getScalarType() == MVT::f32 &&
6092 InputLT.second.getScalarType() == MVT::f16)
6093 return Cost;
6094
6095 // For a ratio of 2, we can use *mlal top/bottom instructions.
6096 if (Ratio == 2 && !IsSub) {
6097 MVT InVT = InputLT.second.getScalarType();
6098
6099 // SVE2 [us]mlalb/t and NEON [us]mlal(2)
6100 if (IsSupported(ST->hasSVE2(), true) &&
6101 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6102 return Cost * 2;
6103
6104 // SVE2 fmlalb/t and NEON fmlal(2)
6105 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6106 return Cost * 2;
6107
6108 // SVE and NEON bfmlalb/t
6109 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6110 return Cost * 2;
6111 }
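// Illustrative sketch (added annotation, not part of the upstream comments):
// for Ratio == 2 a partial reduction such as
//   <8 x i32> acc += zext(<16 x i16> a) * zext(<16 x i16> b)
// can be lowered as a umlalb/umlalt pair on SVE2 (or umlal/umlal2 on NEON),
// a pair of instructions that each accumulate half of the widened products,
// hence the Cost * 2 returned above.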
6112
6114 Opcode, InputTypeA, InputTypeB, AccumType, VF, OpAExtend, OpBExtend,
6115 BinOp, CostKind, FMF);
6116
6117 // Slightly lower the cost of a sub reduction so that it can be considered
6118 // as candidate for 'cdot' operations. This is a somewhat arbitrary number,
6119 // because we don't yet model these operations directly.
6120 return ExpandCost.isValid() && IsSub ? ((8 * ExpandCost) / 10) : ExpandCost;
6121}
6122
6125 VectorType *SrcTy, ArrayRef<int> Mask,
6126 TTI::TargetCostKind CostKind, int Index,
6128 const Instruction *CxtI) const {
6129 assert((Mask.empty() || DstTy->isScalableTy() ||
6130 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6131 "Expected the Mask to match the return size if given");
6132 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6133 "Expected the same scalar types");
6134 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6135
6136 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6137 // into smaller vectors and sum the cost of each shuffle.
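// For example (illustrative): an <8 x i64> shuffle legalizes to four v2i64
// registers, so its 8-element mask is costed as four 2-element sub-shuffles,
// each re-mapped onto at most two v2i64 sources in the loop below.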
6138 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6139 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6140 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6141 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6142 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6143 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6144 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6145 // cost than just the load.
6146 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6149 return std::max<InstructionCost>(1, LT.first / 4);
6150
6151 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6152 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6153 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6154 // cost than just the store.
6155 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6157 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6159 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6160 return LT.first;
6161
6162 unsigned TpNumElts = Mask.size();
6163 unsigned LTNumElts = LT.second.getVectorNumElements();
6164 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6165 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6166 LT.second.getVectorElementCount());
6168 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6169 PreviousCosts;
6170 for (unsigned N = 0; N < NumVecs; N++) {
6171 SmallVector<int> NMask;
6172 // Split the existing mask into chunks of size LTNumElts. Track the source
6173 // sub-vectors to ensure the result has at most 2 inputs.
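// E.g. (illustrative): with LTNumElts == 4, original mask element 9 comes
// from source sub-vector 9 / 4 == 2 and becomes lane 9 % 4 == 1 of the
// sub-shuffle mask built below.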
6174 unsigned Source1 = -1U, Source2 = -1U;
6175 unsigned NumSources = 0;
6176 for (unsigned E = 0; E < LTNumElts; E++) {
6177 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6179 if (MaskElt < 0) {
6181 continue;
6182 }
6183
6184 // Calculate which source from the input this comes from and whether it
6185 // is new to us.
6186 unsigned Source = MaskElt / LTNumElts;
6187 if (NumSources == 0) {
6188 Source1 = Source;
6189 NumSources = 1;
6190 } else if (NumSources == 1 && Source != Source1) {
6191 Source2 = Source;
6192 NumSources = 2;
6193 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6194 NumSources++;
6195 }
6196
6197 // Add to the new mask. For the NumSources>2 case these are not correct,
6198 // but are only used for the modular lane number.
6199 if (Source == Source1)
6200 NMask.push_back(MaskElt % LTNumElts);
6201 else if (Source == Source2)
6202 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6203 else
6204 NMask.push_back(MaskElt % LTNumElts);
6205 }
6206 // Check if we have already generated this sub-shuffle, which means we
6207 // will have already generated the output. For example a <16 x i32> splat
6208 // will be the same sub-splat 4 times, which only needs to be generated
6209 // once and reused.
6210 auto Result =
6211 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6212 // Check if it was already in the map (already costed).
6213 if (!Result.second)
6214 continue;
6215 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6216 // getShuffleCost. If not then cost it using the worst case as the number
6217 // of element moves into a new vector.
6218 InstructionCost NCost =
6219 NumSources <= 2
6220 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6222 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6223 CxtI)
6224 : LTNumElts;
6225 Result.first->second = NCost;
6226 Cost += NCost;
6227 }
6228 return Cost;
6229 }
6230
6231 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6232 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6233 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6234 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6235 // This currently only handles low or high extracts to prevent SLP vectorizer
6236 // regressions.
6237 // Note that SVE's ext instruction is destructive, but it can be fused with
6238 // a movprfx to act like a constructive instruction.
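// For example (illustrative): extracting the low <2 x i32> half of a
// <4 x i32> vector (Index == 0) is treated as free below, while extracting
// the high half (Index == 2) is costed as a single instruction.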
6239 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6240 if (LT.second.getFixedSizeInBits() >= 128 &&
6241 cast<FixedVectorType>(SubTp)->getNumElements() ==
6242 LT.second.getVectorNumElements() / 2) {
6243 if (Index == 0)
6244 return 0;
6245 if (Index == (int)LT.second.getVectorNumElements() / 2)
6246 return 1;
6247 }
6249 }
6250 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6251 // the code to handle length-changing shuffles.
6252 if (Kind == TTI::SK_InsertSubvector) {
6253 LT = getTypeLegalizationCost(DstTy);
6254 SrcTy = DstTy;
6255 }
6256
6257 // Check for identity masks, which we can treat as free for both fixed and
6258 // scalable vector paths.
6259 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6260 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6261 all_of(enumerate(Mask), [](const auto &M) {
6262 return M.value() < 0 || M.value() == (int)M.index();
6263 }))
6264 return 0;
6265
6266 // Segmented shuffle matching.
6267 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6268 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6269 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6271
6273 unsigned Segments =
6275 unsigned SegmentElts = VTy->getNumElements() / Segments;
6276
6277 // dupq zd.t, zn.t[idx]
6278 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6279 ST->isSVEorStreamingSVEAvailable() &&
6280 isDUPQMask(Mask, Segments, SegmentElts))
6281 return LT.first;
6282
6283 // mov zd.q, vn
6284 if (ST->isSVEorStreamingSVEAvailable() &&
6285 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6286 return LT.first;
6287 }
6288
6289 // Check for broadcast loads, which are supported by the LD1R instruction.
6290 // In terms of code-size, the shuffle vector is free when a load + dup get
6291 // folded into a LD1R. That's what we check and return here. For performance
6292 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6293 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6294 // that we model the load + dup sequence slightly higher because LD1R is a
6295 // high latency instruction.
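// Illustrative IR shape assumed for the check below (not from a test case):
//   %s = load float, ptr %p
//   %i = insertelement <4 x float> poison, float %s, i64 0
//   %b = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
// which can fold into a single "ld1r { v0.4s }, [x0]".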
6296 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6297 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6298 if (IsLoad && LT.second.isVector() &&
6299 isLegalBroadcastLoad(SrcTy->getElementType(),
6300 LT.second.getVectorElementCount()))
6301 return 0;
6302 }
6303
6304 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6305 // from the perfect shuffle tables.
6306 if (Mask.size() == 4 &&
6307 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6308 (SrcTy->getScalarSizeInBits() == 16 ||
6309 SrcTy->getScalarSizeInBits() == 32) &&
6310 all_of(Mask, [](int E) { return E < 8; }))
6311 return getPerfectShuffleCost(Mask);
6312
6313 // Check for other shuffles that are not SK_ kinds but we have native
6314 // instructions for, for example ZIP and UZP.
6315 unsigned Unused;
6316 if (LT.second.isFixedLengthVector() &&
6317 LT.second.getVectorNumElements() == Mask.size() &&
6318 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6319 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6320 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6321 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6322 Kind == TTI::SK_InsertSubvector) &&
6323 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6324 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6325 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6326 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6327 LT.second.getVectorNumElements(), 16) ||
6328 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6329 LT.second.getVectorNumElements(), 32) ||
6330 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6331 LT.second.getVectorNumElements(), 64) ||
6332 // Check for non-zero lane splats
6333 all_of(drop_begin(Mask),
6334 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6335 return 1;
6336
6337 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6338 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6339 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6340 static const CostTblEntry ShuffleTbl[] = {
6341 // Broadcast shuffle kinds can be performed with 'dup'.
6342 {TTI::SK_Broadcast, MVT::v8i8, 1},
6343 {TTI::SK_Broadcast, MVT::v16i8, 1},
6344 {TTI::SK_Broadcast, MVT::v4i16, 1},
6345 {TTI::SK_Broadcast, MVT::v8i16, 1},
6346 {TTI::SK_Broadcast, MVT::v2i32, 1},
6347 {TTI::SK_Broadcast, MVT::v4i32, 1},
6348 {TTI::SK_Broadcast, MVT::v2i64, 1},
6349 {TTI::SK_Broadcast, MVT::v4f16, 1},
6350 {TTI::SK_Broadcast, MVT::v8f16, 1},
6351 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6352 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6353 {TTI::SK_Broadcast, MVT::v2f32, 1},
6354 {TTI::SK_Broadcast, MVT::v4f32, 1},
6355 {TTI::SK_Broadcast, MVT::v2f64, 1},
6356 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6357 // 'zip1/zip2' instructions.
6358 {TTI::SK_Transpose, MVT::v8i8, 1},
6359 {TTI::SK_Transpose, MVT::v16i8, 1},
6360 {TTI::SK_Transpose, MVT::v4i16, 1},
6361 {TTI::SK_Transpose, MVT::v8i16, 1},
6362 {TTI::SK_Transpose, MVT::v2i32, 1},
6363 {TTI::SK_Transpose, MVT::v4i32, 1},
6364 {TTI::SK_Transpose, MVT::v2i64, 1},
6365 {TTI::SK_Transpose, MVT::v4f16, 1},
6366 {TTI::SK_Transpose, MVT::v8f16, 1},
6367 {TTI::SK_Transpose, MVT::v4bf16, 1},
6368 {TTI::SK_Transpose, MVT::v8bf16, 1},
6369 {TTI::SK_Transpose, MVT::v2f32, 1},
6370 {TTI::SK_Transpose, MVT::v4f32, 1},
6371 {TTI::SK_Transpose, MVT::v2f64, 1},
6372 // Select shuffle kinds.
6373 // TODO: handle vXi8/vXi16.
6374 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6375 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6376 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6377 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6378 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6379 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6380 // PermuteSingleSrc shuffle kinds.
6381 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6382 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6383 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6384 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6385 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6386 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6387 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6388 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6389 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6390 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6391 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6392 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6393 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6394 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6395 // Reverse can be lowered with `rev`.
6396 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6397 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6398 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6399 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6400 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6401 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6402 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6403 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6404 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6405 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6406 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6407 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6408 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6409 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6410 // Splice can all be lowered as `ext`.
6411 {TTI::SK_Splice, MVT::v2i32, 1},
6412 {TTI::SK_Splice, MVT::v4i32, 1},
6413 {TTI::SK_Splice, MVT::v2i64, 1},
6414 {TTI::SK_Splice, MVT::v2f32, 1},
6415 {TTI::SK_Splice, MVT::v4f32, 1},
6416 {TTI::SK_Splice, MVT::v2f64, 1},
6417 {TTI::SK_Splice, MVT::v8f16, 1},
6418 {TTI::SK_Splice, MVT::v8bf16, 1},
6419 {TTI::SK_Splice, MVT::v8i16, 1},
6420 {TTI::SK_Splice, MVT::v16i8, 1},
6421 {TTI::SK_Splice, MVT::v4f16, 1},
6422 {TTI::SK_Splice, MVT::v4bf16, 1},
6423 {TTI::SK_Splice, MVT::v4i16, 1},
6424 {TTI::SK_Splice, MVT::v8i8, 1},
6425 // Broadcast shuffle kinds for scalable vectors
6426 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6427 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6428 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6429 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6430 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6431 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6432 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6433 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6434 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6435 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6436 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6437 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6438 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6439 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6440 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6441 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6442 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6443 // Handle the cases for vector.reverse with scalable vectors
6444 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6445 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6446 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6447 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6448 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6449 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6450 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6451 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6452 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6453 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6454 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6455 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6456 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6457 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6458 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6459 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6460 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6461 };
6462 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6463 return LT.first * Entry->Cost;
6464 }
6465
6466 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6467 return getSpliceCost(SrcTy, Index, CostKind);
6468
6469 // Inserting a subvector can often be done with either a D, S or H register
6470 // move, so long as the inserted vector is "aligned".
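// For example (illustrative): inserting a legalized <2 x float> subvector
// into a <4 x float> vector at index 0 or 2 is modelled below as a single
// (D-register sized) move.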
6471 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6472 LT.second.getSizeInBits() <= 128 && SubTp) {
6473 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6474 if (SubLT.second.isVector()) {
6475 int NumElts = LT.second.getVectorNumElements();
6476 int NumSubElts = SubLT.second.getVectorNumElements();
6477 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6478 return SubLT.first;
6479 }
6480 }
6481
6482 // Restore optimal kind.
6483 if (IsExtractSubvector)
6485 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6486 Args, CxtI);
6487}
6488
6491 const DominatorTree &DT) {
6492 const auto &Strides = DenseMap<Value *, const SCEV *>();
6493 for (BasicBlock *BB : TheLoop->blocks()) {
6494 // Scan the instructions in the block and look for addresses that are
6495 // consecutive and decreasing.
6496 for (Instruction &I : *BB) {
6497 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6499 Type *AccessTy = getLoadStoreType(&I);
6500 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6501 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6502 .value_or(0) < 0)
6503 return true;
6504 }
6505 }
6506 }
6507 return false;
6508}
6509
6511 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6513 // For cases like post-LTO vectorization, when we eventually know the trip
6514 // count, epilogue with fixed-width vectorization can be deleted if the trip
6515 // count is less than the epilogue iterations. That's why we prefer
6516 // fixed-width vectorization in epilogue in case of equal costs.
6517 if (IsEpilogue)
6518 return true;
6519 return ST->useFixedOverScalableIfEqualCost();
6520}
6521
6523 return ST->getEpilogueVectorizationMinVF();
6524}
6525
6527 if (!ST->hasSVE())
6528 return false;
6529
6530 // We don't currently support vectorisation with interleaving for SVE - with
6531 // such loops we're better off not using tail-folding. This gives us a chance
6532 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6533 if (TFI->IAI->hasGroups())
6534 return false;
6535
6537 if (TFI->LVL->getReductionVars().size())
6539 if (TFI->LVL->getFixedOrderRecurrences().size())
6541
6542 // We call this to discover whether any load/store pointers in the loop have
6543 // negative strides. This will require extra work to reverse the loop
6544 // predicate, which may be expensive.
6547 *TFI->LVL->getDominatorTree()))
6551
6552 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6553 Required))
6554 return false;
6555
6556 // Don't tail-fold for tight loops where we would be better off interleaving
6557 // with an unpredicated loop.
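// Rough worked example (annotation, using the default of 15 from
// sve-tail-folding-insn-threshold): a loop body of ~10 instructions, 4 of
// which are just the IV PHI/add/compare/branch, returns false here and is
// left to the unpredicated, interleavable path instead of being tail-folded.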
6558 unsigned NumInsns = 0;
6559 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6560 NumInsns += BB->size();
6561 }
6562
6563 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6564 return NumInsns >= SVETailFoldInsnThreshold;
6565}
6566
6569 StackOffset BaseOffset, bool HasBaseReg,
6570 int64_t Scale, unsigned AddrSpace) const {
6571 // Scaling factors are not free at all.
6572 // Operands | Rt Latency
6573 // -------------------------------------------
6574 // Rt, [Xn, Xm] | 4
6575 // -------------------------------------------
6576 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6577 // Rt, [Xn, Wm, <extend> #imm] |
6579 AM.BaseGV = BaseGV;
6580 AM.BaseOffs = BaseOffset.getFixed();
6581 AM.HasBaseReg = HasBaseReg;
6582 AM.Scale = Scale;
6583 AM.ScalableOffset = BaseOffset.getScalable();
6584 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6585 // Scale represents reg2 * scale, thus account for 1 if
6586 // it is not equal to 0 or 1.
6587 return AM.Scale != 0 && AM.Scale != 1;
6589}
6590
6592 const Instruction *I) const {
6594 // For the binary operators (e.g. or) we need to be more careful than
6595 // selects, here we only transform them if they are already at a natural
6596 // break point in the code - the end of a block with an unconditional
6597 // terminator.
6598 if (I->getOpcode() == Instruction::Or &&
6599 isa<BranchInst>(I->getNextNode()) && cast<BranchInst>(I->getNextNode())->isUnconditional())
6600 return true;
6601
6602 if (I->getOpcode() == Instruction::Add ||
6603 I->getOpcode() == Instruction::Sub)
6604 return true;
6605 }
6607}
6608
6611 const TargetTransformInfo::LSRCost &C2) const {
6612 // AArch64 specific here is adding the number of instructions to the
6613 // comparison (though not as the first consideration, as some targets do)
6614 // along with changing the priority of the base additions.
6615 // TODO: Maybe a more nuanced tradeoff between instruction count
6616 // and number of registers? To be investigated at a later date.
6617 if (EnableLSRCostOpt)
6618 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6619 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6620 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6621 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6622
6624}
6625
6626static bool isSplatShuffle(Value *V) {
6627 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6628 return all_equal(Shuf->getShuffleMask());
6629 return false;
6630}
6631
6632/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6633/// or upper half of the vector elements.
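/// A minimal sketch of the shape matched (illustrative; when neither operand
/// is a splat, both must extract the same half):
///   %ah = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, ..., i32 15>
///   %bh = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, ..., i32 15>
/// which maps onto NEON high-half forms such as umull2 or ssubl2.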
6634static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6635 bool AllowSplat = false) {
6636 // Scalable types can't be extract shuffle vectors.
6637 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6638 return false;
6639
6640 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6641 auto *FullTy = FullV->getType();
6642 auto *HalfTy = HalfV->getType();
6643 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6644 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6645 };
6646
6647 auto extractHalf = [](Value *FullV, Value *HalfV) {
6648 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6649 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6650 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6651 };
6652
6653 ArrayRef<int> M1, M2;
6654 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6655 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6656 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6657 return false;
6658
6659 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6660 // it is not checked as an extract below.
6661 if (AllowSplat && isSplatShuffle(Op1))
6662 S1Op1 = nullptr;
6663 if (AllowSplat && isSplatShuffle(Op2))
6664 S2Op1 = nullptr;
6665
6666 // Check that the operands are half as wide as the result and we extract
6667 // half of the elements of the input vectors.
6668 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6669 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6670 return false;
6671
6672 // Check the mask extracts either the lower or upper half of vector
6673 // elements.
6674 int M1Start = 0;
6675 int M2Start = 0;
6676 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6677 if ((S1Op1 &&
6678 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6679 (S2Op1 &&
6680 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6681 return false;
6682
6683 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6684 (M2Start != 0 && M2Start != (NumElements / 2)))
6685 return false;
6686 if (S1Op1 && S2Op1 && M1Start != M2Start)
6687 return false;
6688
6689 return true;
6690}
6691
6692/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6693/// of the vector elements.
6694static bool areExtractExts(Value *Ext1, Value *Ext2) {
6695 auto areExtDoubled = [](Instruction *Ext) {
6696 return Ext->getType()->getScalarSizeInBits() ==
6697 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6698 };
6699
6700 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6701 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6702 !areExtDoubled(cast<Instruction>(Ext1)) ||
6703 !areExtDoubled(cast<Instruction>(Ext2)))
6704 return false;
6705
6706 return true;
6707}
6708
6709/// Check if Op could be used with vmull_high_p64 intrinsic.
6711 Value *VectorOperand = nullptr;
6712 ConstantInt *ElementIndex = nullptr;
6713 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6714 m_ConstantInt(ElementIndex))) &&
6715 ElementIndex->getValue() == 1 &&
6716 isa<FixedVectorType>(VectorOperand->getType()) &&
6717 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6718}
6719
6720/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6721static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6723}
6724
6726 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6727 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6728 if (!GEP || GEP->getNumOperands() != 2)
6729 return false;
6730
6731 Value *Base = GEP->getOperand(0);
6732 Value *Offsets = GEP->getOperand(1);
6733
6734 // We only care about scalar_base+vector_offsets.
6735 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6736 return false;
6737
6738 // Sink extends that would allow us to use 32-bit offset vectors.
6739 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6740 auto *OffsetsInst = cast<Instruction>(Offsets);
6741 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6742 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6743 Ops.push_back(&GEP->getOperandUse(1));
6744 }
6745
6746 // Sink the GEP.
6747 return true;
6748}
6749
6750/// We want to sink the following cases:
6751/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6752/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
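/// For instance (illustrative):
///   %vs  = call i64 @llvm.vscale.i64()
///   %sc  = shl i64 %vs, 4
///   %adr = add i64 %base, %sc
/// the uses of %vs (and of %sc in the add) are pushed into Ops so the whole
/// expression is sunk next to its user, where isel can typically fold it into
/// a scalable addressing form.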
6754 if (match(Op, m_VScale()))
6755 return true;
6756 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6758 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6759 return true;
6760 }
6761 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6763 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6764 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6765 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6766 return true;
6767 }
6768 return false;
6769}
6770
6771static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6772
6773/// Check if sinking \p I's operands to I's basic block is profitable, because
6774/// the operands can be folded into a target instruction, e.g.
6775/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
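/// A hedged example of the kind of fold this enables (not from the source):
///   %e1 = sext <8 x i8> %ah to <8 x i16>   ; %ah = high half of %a
///   %e2 = sext <8 x i8> %bh to <8 x i16>   ; %bh = high half of %b
///   %r  = sub <8 x i16> %e1, %e2
/// sinking the extract shuffles and extends next to the sub lets isel select
/// a single ssubl2.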
6779 switch (II->getIntrinsicID()) {
6780 case Intrinsic::aarch64_neon_smull:
6781 case Intrinsic::aarch64_neon_umull:
6782 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6783 /*AllowSplat=*/true)) {
6784 Ops.push_back(&II->getOperandUse(0));
6785 Ops.push_back(&II->getOperandUse(1));
6786 return true;
6787 }
6788 [[fallthrough]];
6789
6790 case Intrinsic::fma:
6791 case Intrinsic::fmuladd:
6792 if (isa<VectorType>(I->getType()) &&
6793 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6794 !ST->hasFullFP16())
6795 return false;
6796
6797 if (isFNeg(II->getOperand(0)))
6798 Ops.push_back(&II->getOperandUse(0));
6799 if (isFNeg(II->getOperand(1)))
6800 Ops.push_back(&II->getOperandUse(1));
6801
6802 [[fallthrough]];
6803 case Intrinsic::aarch64_neon_sqdmull:
6804 case Intrinsic::aarch64_neon_sqdmulh:
6805 case Intrinsic::aarch64_neon_sqrdmulh:
6806 // Sink splats for index lane variants
6807 if (isSplatShuffle(II->getOperand(0)))
6808 Ops.push_back(&II->getOperandUse(0));
6809 if (isSplatShuffle(II->getOperand(1)))
6810 Ops.push_back(&II->getOperandUse(1));
6811 return !Ops.empty();
6812 case Intrinsic::aarch64_neon_fmlal:
6813 case Intrinsic::aarch64_neon_fmlal2:
6814 case Intrinsic::aarch64_neon_fmlsl:
6815 case Intrinsic::aarch64_neon_fmlsl2:
6816 // Sink splats for index lane variants
6817 if (isSplatShuffle(II->getOperand(1)))
6818 Ops.push_back(&II->getOperandUse(1));
6819 if (isSplatShuffle(II->getOperand(2)))
6820 Ops.push_back(&II->getOperandUse(2));
6821 return !Ops.empty();
6822 case Intrinsic::aarch64_sve_ptest_first:
6823 case Intrinsic::aarch64_sve_ptest_last:
6824 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6825 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6826 Ops.push_back(&II->getOperandUse(0));
6827 return !Ops.empty();
6828 case Intrinsic::aarch64_sme_write_horiz:
6829 case Intrinsic::aarch64_sme_write_vert:
6830 case Intrinsic::aarch64_sme_writeq_horiz:
6831 case Intrinsic::aarch64_sme_writeq_vert: {
6832 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6833 if (!Idx || Idx->getOpcode() != Instruction::Add)
6834 return false;
6835 Ops.push_back(&II->getOperandUse(1));
6836 return true;
6837 }
6838 case Intrinsic::aarch64_sme_read_horiz:
6839 case Intrinsic::aarch64_sme_read_vert:
6840 case Intrinsic::aarch64_sme_readq_horiz:
6841 case Intrinsic::aarch64_sme_readq_vert:
6842 case Intrinsic::aarch64_sme_ld1b_vert:
6843 case Intrinsic::aarch64_sme_ld1h_vert:
6844 case Intrinsic::aarch64_sme_ld1w_vert:
6845 case Intrinsic::aarch64_sme_ld1d_vert:
6846 case Intrinsic::aarch64_sme_ld1q_vert:
6847 case Intrinsic::aarch64_sme_st1b_vert:
6848 case Intrinsic::aarch64_sme_st1h_vert:
6849 case Intrinsic::aarch64_sme_st1w_vert:
6850 case Intrinsic::aarch64_sme_st1d_vert:
6851 case Intrinsic::aarch64_sme_st1q_vert:
6852 case Intrinsic::aarch64_sme_ld1b_horiz:
6853 case Intrinsic::aarch64_sme_ld1h_horiz:
6854 case Intrinsic::aarch64_sme_ld1w_horiz:
6855 case Intrinsic::aarch64_sme_ld1d_horiz:
6856 case Intrinsic::aarch64_sme_ld1q_horiz:
6857 case Intrinsic::aarch64_sme_st1b_horiz:
6858 case Intrinsic::aarch64_sme_st1h_horiz:
6859 case Intrinsic::aarch64_sme_st1w_horiz:
6860 case Intrinsic::aarch64_sme_st1d_horiz:
6861 case Intrinsic::aarch64_sme_st1q_horiz: {
6862 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6863 if (!Idx || Idx->getOpcode() != Instruction::Add)
6864 return false;
6865 Ops.push_back(&II->getOperandUse(3));
6866 return true;
6867 }
6868 case Intrinsic::aarch64_neon_pmull:
6869 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6870 return false;
6871 Ops.push_back(&II->getOperandUse(0));
6872 Ops.push_back(&II->getOperandUse(1));
6873 return true;
6874 case Intrinsic::aarch64_neon_pmull64:
6875 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6876 II->getArgOperand(1)))
6877 return false;
6878 Ops.push_back(&II->getArgOperandUse(0));
6879 Ops.push_back(&II->getArgOperandUse(1));
6880 return true;
6881 case Intrinsic::masked_gather:
6882 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6883 return false;
6884 Ops.push_back(&II->getArgOperandUse(0));
6885 return true;
6886 case Intrinsic::masked_scatter:
6887 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6888 return false;
6889 Ops.push_back(&II->getArgOperandUse(1));
6890 return true;
6891 default:
6892 return false;
6893 }
6894 }
6895
6896 auto ShouldSinkCondition = [](Value *Cond,
6897 SmallVectorImpl<Use *> &Ops) -> bool {
6899 return false;
6901 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6902 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6903 return false;
6904 if (isa<CmpInst>(II->getOperand(0)))
6905 Ops.push_back(&II->getOperandUse(0));
6906 return true;
6907 };
6908
6909 switch (I->getOpcode()) {
6910 case Instruction::GetElementPtr:
6911 case Instruction::Add:
6912 case Instruction::Sub:
6913 // Sink vscales closer to uses for better isel
6914 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6915 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6916 Ops.push_back(&I->getOperandUse(Op));
6917 return true;
6918 }
6919 }
6920 break;
6921 case Instruction::Select: {
6922 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6923 return false;
6924
6925 Ops.push_back(&I->getOperandUse(0));
6926 return true;
6927 }
6928 case Instruction::Br: {
6929 if (cast<BranchInst>(I)->isUnconditional())
6930 return false;
6931 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6932 return false;
6933
6934 Ops.push_back(&I->getOperandUse(0));
6935 return true;
6936 }
6937 case Instruction::FMul:
6938 // fmul with contract flag can be combined with fadd into fma.
6939 // Sinking fneg into this block enables fmls pattern.
6940 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6941 if (isFNeg(I->getOperand(0)))
6942 Ops.push_back(&I->getOperandUse(0));
6943 if (isFNeg(I->getOperand(1)))
6944 Ops.push_back(&I->getOperandUse(1));
6945 }
6946 break;
6947
6948 // Type | BIC | ORN | EON
6949 // ----------------+-----------+-----------+-----------
6950 // scalar | Base | Base | Base
6951 // scalar w/shift | - | - | -
6952 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
6953 // scalable vector | SVE | - | BSL2N
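// Illustrative example (assumed lowering, not from this file): sinking
//   %n = xor i64 %y, -1
// next to
//   %r = xor i64 %x, %n
// lets isel pick a single scalar EON, and the analogous vector
// (and X, (not Y)) pattern becomes BIC.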
6954 case Instruction::Xor:
6955 // EON only for scalars (possibly expanded fixed vectors)
6956 // and vectors using the SVE2/SME BSL2N instruction.
6957 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
6958 bool HasBSL2N =
6959 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
6960 if (!HasBSL2N)
6961 break;
6962 }
6963 [[fallthrough]];
6964 case Instruction::And:
6965 case Instruction::Or:
6966 // Even though we could use the SVE2/SME BSL2N instruction,
6967 // it might pessimize with an extra MOV depending on register allocation.
6968 if (I->getOpcode() == Instruction::Or &&
6969 isa<ScalableVectorType>(I->getType()))
6970 break;
6971 // Shifts can be folded into scalar AND/ORR/EOR,
6972 // but not into the non-negated operand of BIC/ORN/EON.
6973 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
6975 break;
6976 for (auto &Op : I->operands()) {
6977 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
6978 if (match(Op.get(), m_Not(m_Value()))) {
6979 Ops.push_back(&Op);
6980 return true;
6981 }
6982 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
6983 if (match(Op.get(),
6985 m_Value(), m_ZeroMask()))) {
6986 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
6987 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
6988 Ops.push_back(&Not);
6989 Ops.push_back(&InsertElt);
6990 Ops.push_back(&Op);
6991 return true;
6992 }
6993 }
6994 break;
6995 default:
6996 break;
6997 }
6998
6999 if (!I->getType()->isVectorTy())
7000 return !Ops.empty();
7001
7002 switch (I->getOpcode()) {
7003 case Instruction::Sub:
7004 case Instruction::Add: {
7005 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7006 return false;
7007
7008 // If the exts' operands extract either the lower or upper elements, we
7009 // can sink them too.
7010 auto Ext1 = cast<Instruction>(I->getOperand(0));
7011 auto Ext2 = cast<Instruction>(I->getOperand(1));
7012 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7013 Ops.push_back(&Ext1->getOperandUse(0));
7014 Ops.push_back(&Ext2->getOperandUse(0));
7015 }
7016
7017 Ops.push_back(&I->getOperandUse(0));
7018 Ops.push_back(&I->getOperandUse(1));
7019
7020 return true;
7021 }
7022 case Instruction::Or: {
7023 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7024 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7025 if (ST->hasNEON()) {
7026 Instruction *OtherAnd, *IA, *IB;
7027 Value *MaskValue;
7028 // MainAnd refers to the And instruction that has 'Not' as one of its operands
7029 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7030 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7031 m_Instruction(IA)))))) {
7032 if (match(OtherAnd,
7033 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
7034 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7035 ? cast<Instruction>(I->getOperand(1))
7036 : cast<Instruction>(I->getOperand(0));
7037
7038 // Both Ands should be in same basic block as Or
7039 if (I->getParent() != MainAnd->getParent() ||
7040 I->getParent() != OtherAnd->getParent())
7041 return false;
7042
7043 // Non-mask operands of both Ands should also be in same basic block
7044 if (I->getParent() != IA->getParent() ||
7045 I->getParent() != IB->getParent())
7046 return false;
7047
7048 Ops.push_back(
7049 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7050 Ops.push_back(&I->getOperandUse(0));
7051 Ops.push_back(&I->getOperandUse(1));
7052
7053 return true;
7054 }
7055 }
7056 }
7057
7058 return false;
7059 }
7060 case Instruction::Mul: {
7061 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7062 auto *Ty = cast<VectorType>(V->getType());
7063 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7064 if (Ty->isScalableTy())
7065 return false;
7066
7067 // Indexed variants of Mul exist for i16 and i32 element types only.
7068 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7069 };
7070
7071 int NumZExts = 0, NumSExts = 0;
7072 for (auto &Op : I->operands()) {
7073 // Make sure we are not already sinking this operand
7074 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7075 continue;
7076
7077 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7078 auto *Ext = cast<Instruction>(Op);
7079 auto *ExtOp = Ext->getOperand(0);
7080 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7081 Ops.push_back(&Ext->getOperandUse(0));
7082 Ops.push_back(&Op);
7083
7084 if (isa<SExtInst>(Ext)) {
7085 NumSExts++;
7086 } else {
7087 NumZExts++;
7088 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7089 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7090 I->getType()->getScalarSizeInBits())
7091 NumSExts++;
7092 }
7093
7094 continue;
7095 }
7096
7098 if (!Shuffle)
7099 continue;
7100
7101 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7102 // operand and the s/zext can help create indexed s/umull. This is
7103 // especially useful to prevent i64 mul being scalarized.
7104 if (isSplatShuffle(Shuffle) &&
7105 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7106 Ops.push_back(&Shuffle->getOperandUse(0));
7107 Ops.push_back(&Op);
7108 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7109 NumSExts++;
7110 else
7111 NumZExts++;
7112 continue;
7113 }
7114
7115 Value *ShuffleOperand = Shuffle->getOperand(0);
7116 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7117 if (!Insert)
7118 continue;
7119
7120 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7121 if (!OperandInstr)
7122 continue;
7123
7124 ConstantInt *ElementConstant =
7125 dyn_cast<ConstantInt>(Insert->getOperand(2));
7126 // Check that the insertelement is inserting into element 0
7127 if (!ElementConstant || !ElementConstant->isZero())
7128 continue;
7129
7130 unsigned Opcode = OperandInstr->getOpcode();
7131 if (Opcode == Instruction::SExt)
7132 NumSExts++;
7133 else if (Opcode == Instruction::ZExt)
7134 NumZExts++;
7135 else {
7136 // If we find that the top bits are known 0, then we can sink and allow
7137 // the backend to generate a umull.
7138 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7139 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7140 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7141 continue;
7142 NumZExts++;
7143 }
7144
7145 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7146 // the And, just to hoist it again back to the load.
7147 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7148 Ops.push_back(&Insert->getOperandUse(1));
7149 Ops.push_back(&Shuffle->getOperandUse(0));
7150 Ops.push_back(&Op);
7151 }
7152
7153 // It is profitable to sink if we found two of the same type of extends.
7154 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7155 return true;
7156
7157 // Otherwise, see if we should sink splats for indexed variants.
7158 if (!ShouldSinkSplatForIndexedVariant(I))
7159 return false;
7160
7161 Ops.clear();
7162 if (isSplatShuffle(I->getOperand(0)))
7163 Ops.push_back(&I->getOperandUse(0));
7164 if (isSplatShuffle(I->getOperand(1)))
7165 Ops.push_back(&I->getOperandUse(1));
7166
7167 return !Ops.empty();
7168 }
7169 case Instruction::FMul: {
7170 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7171 if (I->getType()->isScalableTy())
7172 return !Ops.empty();
7173
7174 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7175 !ST->hasFullFP16())
7176 return !Ops.empty();
7177
7178 // Sink splats for index lane variants
7179 if (isSplatShuffle(I->getOperand(0)))
7180 Ops.push_back(&I->getOperandUse(0));
7181 if (isSplatShuffle(I->getOperand(1)))
7182 Ops.push_back(&I->getOperandUse(1));
7183 return !Ops.empty();
7184 }
7185 default:
7186 return false;
7187 }
7188 return false;
7189}
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has lower throughput than Opcode2 according to the scheduling model.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1083
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
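A minimal, self-contained sketch of the APInt query helpers listed above (the values chosen here are illustrative only, not taken from the source file):

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintExamples() {
  APInt V(64, 48);                           // 64-bit value 48 = 0b110000
  bool Pow2 = V.isPowerOf2();                // false: 48 is not a power of two
  unsigned Ones = V.popcount();              // 2 set bits
  APInt Low = APInt::getLowBitsSet(64, 4);   // 0x000000000000000F
  APInt High = APInt::getHighBitsSet(64, 4); // 0xF000000000000000
  APInt Narrow = V.sextOrTrunc(32);          // sign-extend or truncate to 32 bits
  int64_t S = Narrow.getSExtValue();         // 48
  (void)Pow2; (void)Ones; (void)Low; (void)High; (void)S;
}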
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
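A small sketch of how the predicate enumerators above are distinguished; the function is standalone and the predicates are chosen purely for illustration:

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

void predicateExamples() {
  // Integer predicates report true from isIntPredicate(); FP predicates do not.
  bool IsInt = CmpInst::isIntPredicate(CmpInst::ICMP_SLT); // true
  bool IsFP = CmpInst::isIntPredicate(CmpInst::FCMP_OLT);  // false
  (void)IsInt; (void)IsFP;
}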
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
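A hedged sketch of the constant factory methods listed above (assuming an LLVMContext is already available; types and values are illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void constantExamples(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Zero = Constant::getNullValue(I32);         // i32 0
  Constant *AllOnes = Constant::getAllOnesValue(I32);   // i32 -1
  ConstantInt *True = ConstantInt::getBool(Ctx, true);  // i1 true
  // A splat constant: <4 x i32> with every lane equal to -1.
  Constant *Splat = ConstantVector::getSplat(ElementCount::getFixed(4), AllOnes);
  if (Constant *SV = Splat->getSplatValue())
    (void)SV;                                           // SV == AllOnes for a uniform splat
  (void)Zero; (void)True;
}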
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
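A short sketch of how fixed and scalable ElementCount values behave (standalone, values illustrative):

#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountExamples() {
  ElementCount Fixed = ElementCount::getFixed(4);    // exactly 4 lanes
  ElementCount Scal = ElementCount::getScalable(2);  // 2 x vscale lanes
  bool IsScalable = Scal.isScalable();               // true
  unsigned MinLanes = Scal.getKnownMinValue();       // 2
  ElementCount Half = Fixed.divideCoefficientBy(2);  // 2 fixed lanes
  (void)IsScalable; (void)MinLanes; (void)Half;
}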
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:70
bool noInfs() const
Definition FMF.h:69
bool approxFunc() const
Definition FMF.h:73
bool allowContract() const
Definition FMF.h:72
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2620
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1148
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2608
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:619
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:604
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2003
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2317
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2532
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1754
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2235
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1913
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2642
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1926
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2308
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2847
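A self-contained sketch showing a few of the IRBuilder creation methods above in use; the function name "splat_demo" and the emitted IR are illustrative only:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Builds a tiny function that splats a float across 4 lanes and extracts lane 0.
Function *buildSplatExample(Module &M) {
  LLVMContext &Ctx = M.getContext();
  auto *FTy = FunctionType::get(Type::getFloatTy(Ctx), {Type::getFloatTy(Ctx)},
                                /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "splat_demo", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(Ctx);
  B.SetInsertPoint(BB);
  Value *Splat = B.CreateVectorSplat(4, F->getArg(0), "splat");
  Value *Lane0 = B.CreateExtractElement(Splat, B.getInt64(0), "lane0");
  B.CreateRet(Lane0);
  return F;
}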
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
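A hedged sketch of the ScalarEvolution trip-count queries listed above, assuming a pass context where a ScalarEvolution reference and a loop pointer are already available; the limit of 16 is an arbitrary example threshold:

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

static bool hasSmallConstantTripCount(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return false;                     // trip count is not predictable
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  return MaxTC != 0 && MaxTC <= 16;   // 0 means "unknown"
}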
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
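A small sketch of the shuffle-mask classification helpers above on hand-written masks (the masks are illustrative):

#include "llvm/IR/Instructions.h"
using namespace llvm;

void shuffleMaskExamples() {
  // <0, 2, 4, 6> selects the even lanes: a de-interleave of factor 2, index 0.
  int DeinterleaveMask[] = {0, 2, 4, 6};
  unsigned Index = 0;
  bool IsDeinterleave =
      ShuffleVectorInst::isDeInterleaveMaskOfFactor(DeinterleaveMask, 2, Index);
  // <4, 5, 6, 7> out of 8 source elements is an extract-subvector at offset 4.
  int ExtractMask[] = {4, 5, 6, 7};
  int Start = 0;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(ExtractMask, /*NumSrcElts=*/8, Start);
  (void)IsDeinterleave; (void)IsExtract;
}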
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
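A minimal sketch of StackOffset's fixed and scalable components (byte counts are illustrative, e.g. an SVE spill slot below a fixed-size local area):

#include "llvm/Support/TypeSize.h"
using namespace llvm;

void stackOffsetExamples() {
  StackOffset FixedOff = StackOffset::getFixed(16);    // 16 bytes
  StackOffset ScalOff = StackOffset::getScalable(32);  // 32 * vscale bytes
  int64_t F = FixedOff.getFixed();                     // 16
  int64_t S = ScalOff.getScalable();                   // 32
  (void)F; (void)S;
}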
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
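A short sketch of StringRef::split on an option-like string (the string itself is illustrative):

#include "llvm/ADT/StringRef.h"
using namespace llvm;

void splitExample() {
  StringRef Option = "sve-tail-folding=all";
  // split() returns the text before and after the first separator.
  auto [Name, Value] = Option.split('=');
  // Name == "sve-tail-folding", Value == "all"
  (void)Name; (void)Value;
}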
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
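A hedged sketch of the Type query and rewrite helpers above (assuming an LLVMContext is available; the concrete types are illustrative):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void typeExamples(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Type *F32 = Type::getFloatTy(Ctx);
  auto *V4I32 = FixedVectorType::get(I32, 4);
  bool IsVec = V4I32->isVectorTy();                // true
  unsigned EltBits = V4I32->getScalarSizeInBits(); // 32
  Type *V4I64 = V4I32->getWithNewBitWidth(64);     // <4 x i64>
  Type *V4F32 = V4I32->getWithNewType(F32);        // <4 x float>
  (void)IsVec; (void)EltBits; (void)V4I64; (void)V4F32;
}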
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
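A small sketch of the fixed and scalable vector type factories listed above (element type and lane counts are illustrative):

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

void vectorTypeExamples(LLVMContext &Ctx) {
  Type *F16 = Type::getHalfTy(Ctx);
  auto *Fixed = FixedVectorType::get(F16, 8);        // <8 x half>
  auto *Scalable = ScalableVectorType::get(F16, 8);  // <vscale x 8 x half>
  ElementCount ScalEC = Scalable->getElementCount(); // scalable, min 8
  // The generic factory picks the right subclass from the ElementCount.
  VectorType *Same = VectorType::get(F16, ScalEC);   // scalable again
  VectorType *AsInt = VectorType::getInteger(Fixed); // <8 x i16>
  (void)Same; (void)AsInt;
}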
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
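A hedged sketch of the PatternMatch combinators above; the matched pattern "(X << C) & mask" is chosen purely for illustration:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V is "(X << C) & mask" for some value X and constant ints C, mask.
static bool isMaskedShift(Value *V) {
  Value *X;
  return match(V, m_c_And(m_OneUse(m_Shl(m_Value(X), m_ConstantInt())),
                          m_ConstantInt()));
}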
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
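A minimal sketch of the range wrappers above on a small container (the data is illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

void rangeHelperExamples() {
  SmallVector<int, 4> Lanes = {0, 2, 4, 6};
  bool AllEven = all_of(Lanes, [](int L) { return L % 2 == 0; }); // true
  bool HasOdd = any_of(Lanes, [](int L) { return L % 2 != 0; });  // false
  bool HasSix = is_contained(Lanes, 6);                           // true
  (void)AllEven; (void)HasOdd; (void)HasSix;
}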
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
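A hedged sketch of a CostTableLookup query; the table entries and the fallback cost are made up for illustration, not taken from the AArch64 cost tables:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned lookupAddCost(MVT VT) {
  // Hypothetical per-type cost table.
  static const CostTblEntry Tbl[] = {
      {ISD::ADD, MVT::v4i32, 1},
      {ISD::ADD, MVT::v2i64, 1},
  };
  if (const auto *Entry = CostTableLookup(Tbl, ISD::ADD, VT))
    return Entry->Cost;
  return 4; // fallback when the type is not in the table
}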
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool isFixedLengthVector() const
Definition ValueTypes.h:189
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:182
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
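A short sketch of the EVT queries above, converting an IR type to an EVT and building a scalable variant (the concrete types are illustrative):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

void evtExamples(LLVMContext &Ctx) {
  Type *Ty = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  EVT VT = EVT::getEVT(Ty);                      // MVT::v4i32
  if (VT.isSimple() && VT.isFixedLengthVector()) {
    unsigned NumElts = VT.getVectorNumElements(); // 4
    uint64_t EltBits = VT.getScalarSizeInBits();  // 32
    (void)NumElts; (void)EltBits;
  }
  // A scalable equivalent: <vscale x 4 x i32>.
  EVT ScalVT = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);
  (void)ScalVT;
}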
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
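A minimal sketch of how a target hook might fill in a few of the unrolling knobs listed above; the class name MyTTIImpl and the chosen values are illustrative only, not the AArch64 defaults:

void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::UnrollingPreferences &UP,
                                        OptimizationRemarkEmitter *ORE) const {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.UpperBound = true;             // may use the trip-count upper bound
  UP.UnrollRemainder = false;       // keep the remainder loop rolled
  UP.DefaultUnrollRuntimeCount = 4; // default factor for runtime unrolling
  UP.PartialThreshold = 150;        // size budget for partial unrolling
}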