LLVM 23.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Hidden boolean command-line option "enable-falkor-hwpf-unroll-fix"; it is
// initialised to true. NOTE(review): the code that reads this flag is not
// visible in this part of the file.
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
// Hidden unsigned command-line option "sve-scatter-overhead"; it is
// initialised to 10. NOTE(review): presumably an extra cost applied to SVE
// scatter operations -- the use site is not visible here, confirm there.
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
// Hidden unsigned command-line option "sve-tail-folding-insn-threshold"; it is
// initialised to 15. NOTE(review): presumably an instruction-count threshold
// consulted when deciding on tail-folding -- the use site is not visible here.
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
// Hidden boolean command-line option "enable-aarch64-or-like-select"; it is
// initialised to true. NOTE(review): the code that reads this flag is not
// visible in this part of the file.
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
// Hidden boolean command-line option "enable-aarch64-lsr-cost-opt"; it is
// initialised to true. NOTE(review): the code that reads this flag is not
// visible in this part of the file.
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// File-scope TailFoldingOption instance. NOTE(review): presumably the storage
// that the -sve-tail-folding option writes into through operator= -- the
// option declaration that would reference it is not fully visible here.
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
280 const Function *Callee) const {
281 SMECallAttrs CallAttrs(*Caller, *Callee);
282
283 // Never inline a function explicitly marked as being streaming,
284 // into a non-streaming function. Assume it was marked as streaming
285 // for a reason.
286 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
288 return false;
289
290 // When inlining, we should consider the body of the function, not the
291 // interface.
292 if (CallAttrs.callee().hasStreamingBody()) {
293 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
294 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
295 }
296
297 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
298 return false;
299
300 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
301 CallAttrs.requiresPreservingZT0() ||
302 CallAttrs.requiresPreservingAllZAState()) {
303 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
304 return false;
305 }
306
307 return BaseT::areInlineCompatible(Caller, Callee);
308}
309
311 const Function *Callee,
312 ArrayRef<Type *> Types) const {
313 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
314 return false;
315
316 // We need to ensure that argument promotion does not attempt to promote
317 // pointers to fixed-length vector types larger than 128 bits like
318 // <8 x float> (and pointers to aggregate types which have such fixed-length
319 // vector type members) into the values of the pointees. Such vector types
320 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
321 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
322 // types can be safely treated as 128-bit NEON types and they cannot be
323 // distinguished in IR.
324 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
325 auto FVTy = dyn_cast<FixedVectorType>(Ty);
326 return FVTy &&
327 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
328 }))
329 return false;
330
331 return true;
332}
333
334unsigned
336 unsigned DefaultCallPenalty) const {
337 // This function calculates a penalty for executing Call in F.
338 //
339 // There are two ways this function can be called:
340 // (1) F:
341 // call from F -> G (the call here is Call)
342 //
343 // For (1), Call.getCaller() == F, so it will always return a high cost if
344 // a streaming-mode change is required (thus promoting the need to inline the
345 // function)
346 //
347 // (2) F:
348 // call from F -> G (the call here is not Call)
349 // G:
350 // call from G -> H (the call here is Call)
351 //
352 // For (2), if after inlining the body of G into F the call to H requires a
353 // streaming-mode change, and the call to G from F would also require a
354 // streaming-mode change, then there is benefit to do the streaming-mode
355 // change only once and avoid inlining of G into F.
356
357 SMEAttrs FAttrs(*F);
358 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
359
360 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
361 if (F == Call.getCaller()) // (1)
362 return CallPenaltyChangeSM * DefaultCallPenalty;
363 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
364 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
365 }
366
367 return DefaultCallPenalty;
368}
369
373
374 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
375 return true;
376
378 ST->isSVEorStreamingSVEAvailable() &&
379 !ST->disableMaximizeScalableBandwidth();
380}
381
382/// Calculate the cost of materializing a 64-bit value. This helper
383/// method might only calculate a fraction of a larger immediate. Therefore it
384/// is valid to return a cost of ZERO.
386 // Check if the immediate can be encoded within an instruction.
387 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
388 return 0;
389
390 if (Val < 0)
391 Val = ~Val;
392
393 // Calculate how many moves we will need to materialize this constant.
395 AArch64_IMM::expandMOVImm(Val, 64, Insn);
396 return Insn.size();
397}
398
399/// Calculate the cost of materializing the given constant.
403 assert(Ty->isIntegerTy());
404
405 unsigned BitSize = Ty->getPrimitiveSizeInBits();
406 if (BitSize == 0)
407 return ~0U;
408
409 // Sign-extend all constants to a multiple of 64-bit.
410 APInt ImmVal = Imm;
411 if (BitSize & 0x3f)
412 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
413
414 // Split the constant into 64-bit chunks and calculate the cost for each
415 // chunk.
417 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
418 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
419 int64_t Val = Tmp.getSExtValue();
420 Cost += getIntImmCost(Val);
421 }
422 // We need at least one instruction to materialize the constant.
423 return std::max<InstructionCost>(1, Cost);
424}
425
427 const APInt &Imm, Type *Ty,
429 Instruction *Inst) const {
430 assert(Ty->isIntegerTy());
431
432 unsigned BitSize = Ty->getPrimitiveSizeInBits();
433 // There is no cost model for constants with a bit size of 0. Return TCC_Free
434 // here, so that constant hoisting will ignore this constant.
435 if (BitSize == 0)
436 return TTI::TCC_Free;
437
438 unsigned ImmIdx = ~0U;
439 switch (Opcode) {
440 default:
441 return TTI::TCC_Free;
442 case Instruction::GetElementPtr:
443 // Always hoist the base address of a GetElementPtr.
444 if (Idx == 0)
445 return 2 * TTI::TCC_Basic;
446 return TTI::TCC_Free;
447 case Instruction::Store:
448 ImmIdx = 0;
449 break;
450 case Instruction::Add:
451 case Instruction::Sub:
452 case Instruction::Mul:
453 case Instruction::UDiv:
454 case Instruction::SDiv:
455 case Instruction::URem:
456 case Instruction::SRem:
457 case Instruction::And:
458 case Instruction::Or:
459 case Instruction::Xor:
460 case Instruction::ICmp:
461 ImmIdx = 1;
462 break;
463 // Always return TCC_Free for the shift value of a shift instruction.
464 case Instruction::Shl:
465 case Instruction::LShr:
466 case Instruction::AShr:
467 if (Idx == 1)
468 return TTI::TCC_Free;
469 break;
470 case Instruction::Trunc:
471 case Instruction::ZExt:
472 case Instruction::SExt:
473 case Instruction::IntToPtr:
474 case Instruction::PtrToInt:
475 case Instruction::BitCast:
476 case Instruction::PHI:
477 case Instruction::Call:
478 case Instruction::Select:
479 case Instruction::Ret:
480 case Instruction::Load:
481 break;
482 }
483
484 if (Idx == ImmIdx) {
485 int NumConstants = (BitSize + 63) / 64;
487 return (Cost <= NumConstants * TTI::TCC_Basic)
488 ? static_cast<int>(TTI::TCC_Free)
489 : Cost;
490 }
492}
493
496 const APInt &Imm, Type *Ty,
498 assert(Ty->isIntegerTy());
499
500 unsigned BitSize = Ty->getPrimitiveSizeInBits();
501 // There is no cost model for constants with a bit size of 0. Return TCC_Free
502 // here, so that constant hoisting will ignore this constant.
503 if (BitSize == 0)
504 return TTI::TCC_Free;
505
506 // Most (all?) AArch64 intrinsics do not support folding immediates into the
507 // selected instruction, so we compute the materialization cost for the
508 // immediate directly.
509 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
511
512 switch (IID) {
513 default:
514 return TTI::TCC_Free;
515 case Intrinsic::sadd_with_overflow:
516 case Intrinsic::uadd_with_overflow:
517 case Intrinsic::ssub_with_overflow:
518 case Intrinsic::usub_with_overflow:
519 case Intrinsic::smul_with_overflow:
520 case Intrinsic::umul_with_overflow:
521 if (Idx == 1) {
522 int NumConstants = (BitSize + 63) / 64;
524 return (Cost <= NumConstants * TTI::TCC_Basic)
525 ? static_cast<int>(TTI::TCC_Free)
526 : Cost;
527 }
528 break;
529 case Intrinsic::experimental_stackmap:
530 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
531 return TTI::TCC_Free;
532 break;
533 case Intrinsic::experimental_patchpoint_void:
534 case Intrinsic::experimental_patchpoint:
535 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 case Intrinsic::experimental_gc_statepoint:
539 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
540 return TTI::TCC_Free;
541 break;
542 }
544}
545
547AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
548 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
549 if (TyWidth == 32 || TyWidth == 64)
551 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
552 return TTI::PSK_Software;
553}
554
556 // MispredictPenalty is defined per-CPU in AArch64Sched*.td (e.g.,
557 // AArch64SchedNeoverseV2.td).
558 return ST->getSchedModel().MispredictPenalty;
559}
560
561static bool isUnpackedVectorVT(EVT VecVT) {
562 return VecVT.isScalableVector() &&
564}
565
567 const IntrinsicCostAttributes &ICA) {
568 // We need to know at least the number of elements in the vector of buckets
569 // and the size of each element to update.
570 if (ICA.getArgTypes().size() < 2)
572
573 // Only interested in costing for the hardware instruction from SVE2.
574 if (!ST->hasSVE2())
576
577 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
578 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
579 unsigned TotalHistCnts = 1;
580
581 unsigned EltSize = EltTy->getScalarSizeInBits();
582 // Only allow (up to 64b) integers or pointers
583 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
585
586 // FIXME: We should be able to generate histcnt for fixed-length vectors
587 // using ptrue with a specific VL.
588 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
589 unsigned EC = VTy->getElementCount().getKnownMinValue();
590 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
592
593 // HistCnt only supports 32b and 64b element types
594 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
595
596 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
598
599 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
600 TotalHistCnts = EC / NaturalVectorWidth;
601
602 return InstructionCost(BaseHistCntCost * TotalHistCnts);
603 }
604
606}
607
611 // The code-generator is currently not able to handle scalable vectors
612 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
613 // it. This change will be removed when code-generation for these types is
614 // sufficiently reliable.
615 auto *RetTy = ICA.getReturnType();
616 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
617 if (VTy->getElementCount() == ElementCount::getScalable(1))
619
620 switch (ICA.getID()) {
621 case Intrinsic::experimental_vector_histogram_add: {
622 InstructionCost HistCost = getHistogramCost(ST, ICA);
623 // If the cost isn't valid, we may still be able to scalarize
624 if (HistCost.isValid())
625 return HistCost;
626 break;
627 }
628 case Intrinsic::clmul: {
629 auto LT = getTypeLegalizationCost(RetTy);
630
631 // PMUL v8i8/v16i8 is always available on AArch64
632 if (ST->hasNEON()) {
633 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
634 return LT.first;
635
636 // Scalar i8 lowers through scalar/vector moves around PMUL.
637 if (TLI->getValueType(DL, RetTy, true) == MVT::i8) {
638 auto *VecTy =
639 FixedVectorType::get(Type::getInt8Ty(RetTy->getContext()), 8);
640 return 1 +
641 getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
642 -1, nullptr, nullptr) *
643 2 +
644 getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
645 -1, nullptr, nullptr);
646 }
647 }
648
649 if (LT.second.SimpleTy == MVT::nxv2i64)
650 if (ST->hasSVEAES() && (ST->isSVEAvailable() || ST->hasSSVE_AES()))
651 return LT.first * 3;
652
653 if (ST->hasSVE2() || ST->hasSME()) {
654 switch (LT.second.SimpleTy) {
655 case MVT::nxv16i8:
656 return LT.first;
657 case MVT::nxv8i16:
658 return LT.first * 6;
659 case MVT::nxv4i32:
660 return LT.first * 3;
661 case MVT::nxv2i64:
662 return LT.first * 8;
663 default:
664 break;
665 }
666 }
667
668 // Avoid +sve giving this cost 2 due to custom lowering: It's very slow
669 if (LT.second.SimpleTy == MVT::nxv2i64)
670 return 192;
671
672 if (ST->hasAES()) {
673 switch (LT.second.SimpleTy) {
674 case MVT::i16:
675 case MVT::i32:
676 case MVT::i64:
677 case MVT::i128: {
678 auto *VecTy =
679 FixedVectorType::get(Type::getInt64Ty(RetTy->getContext()), 1);
680 return LT.first *
681 (1 +
682 getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
683 -1, nullptr, nullptr) *
684 2 +
685 getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
686 -1, nullptr, nullptr));
687 }
688 case MVT::v1i64:
689 return LT.first;
690 case MVT::v2i64:
691 return LT.first * 3;
692 case MVT::v2i32:
693 return LT.first * 6;
694 case MVT::v4i32:
695 return LT.first * 11;
696 case MVT::v4i16:
697 return LT.first * 14;
698 default:
699 break;
700 }
701 }
702 break;
703 }
704 case Intrinsic::umin:
705 case Intrinsic::umax:
706 case Intrinsic::smin:
707 case Intrinsic::smax: {
708 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
709 MVT::v8i16, MVT::v2i32, MVT::v4i32,
710 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
711 MVT::nxv2i64};
712 auto LT = getTypeLegalizationCost(RetTy);
713 // v2i64 types get converted to cmp+bif hence the cost of 2
714 if (LT.second == MVT::v2i64)
715 return LT.first * 2;
716 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
717 return LT.first;
718 break;
719 }
720 case Intrinsic::scmp:
721 case Intrinsic::ucmp: {
722 static const CostTblEntry BitreverseTbl[] = {
723 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
724 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
725 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
726 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
727 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
728 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
729 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
730 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
731 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
732 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
733 };
734 const auto LT = getTypeLegalizationCost(RetTy);
735 const auto *Entry =
736 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
737 if (Entry)
738 return Entry->Cost * LT.first;
739 break;
740 }
741 case Intrinsic::sadd_sat:
742 case Intrinsic::ssub_sat:
743 case Intrinsic::uadd_sat:
744 case Intrinsic::usub_sat: {
745 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
746 MVT::v8i16, MVT::v2i32, MVT::v4i32,
747 MVT::v2i64};
748 auto LT = getTypeLegalizationCost(RetTy);
749 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
750 // need to extend the type, as it uses shr(qadd(shl, shl)).
751 unsigned Instrs =
752 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
753 if (any_of(ValidSatTys, equal_to(LT.second)))
754 return LT.first * Instrs;
755
757 uint64_t VectorSize = TS.getKnownMinValue();
758
759 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
760 return LT.first * Instrs;
761
762 break;
763 }
764 case Intrinsic::abs: {
765 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
766 MVT::v8i16, MVT::v2i32, MVT::v4i32,
767 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
768 MVT::nxv4i32, MVT::nxv2i64};
769 auto LT = getTypeLegalizationCost(RetTy);
770 if (any_of(ValidAbsTys, equal_to(LT.second)))
771 return LT.first;
772 break;
773 }
774 case Intrinsic::bswap: {
775 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
776 MVT::v4i32, MVT::v2i64};
777 auto LT = getTypeLegalizationCost(RetTy);
778 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
779 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
780 return LT.first;
781 break;
782 }
783 case Intrinsic::fma:
784 case Intrinsic::fmuladd: {
785 // Cost an fma or fmuladd the same as an fmul instruction, since the two
786 // usually have the same cost. TODO: Add fp16 and bf16 expansion costs.
787 Type *EltTy = RetTy->getScalarType();
788 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
789 (EltTy->isHalfTy() && ST->hasFullFP16()))
790 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
791 break;
792 }
793 case Intrinsic::stepvector: {
794 InstructionCost Cost = 1; // Cost of the `index' instruction
795 auto LT = getTypeLegalizationCost(RetTy);
796 // Legalisation of illegal vectors involves an `index' instruction plus
797 // (LT.first - 1) vector adds.
798 if (LT.first > 1) {
799 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
800 InstructionCost AddCost =
801 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
802 Cost += AddCost * (LT.first - 1);
803 }
804 return Cost;
805 }
806 case Intrinsic::vector_extract:
807 case Intrinsic::vector_insert: {
808 // If both the vector and subvector types are legal types and the index
809 // is 0, then this should be a no-op or simple operation; return a
810 // relatively low cost.
811
812 // If arguments aren't actually supplied, then we cannot determine the
813 // value of the index. We also want to skip predicate types.
814 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
816 break;
817
818 LLVMContext &C = RetTy->getContext();
819 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
820 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
821 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
822 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
823 // Skip this if either the vector or subvector types are unpacked
824 // SVE types; they may get lowered to stack stores and loads.
825 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
826 break;
827
829 getTLI()->getTypeConversion(C, SubVecVT);
831 getTLI()->getTypeConversion(C, VecVT);
832 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
833 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
834 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
835 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
836 return TTI::TCC_Free;
837 break;
838 }
839 case Intrinsic::bitreverse: {
840 static const CostTblEntry BitreverseTbl[] = {
841 {Intrinsic::bitreverse, MVT::i32, 1},
842 {Intrinsic::bitreverse, MVT::i64, 1},
843 {Intrinsic::bitreverse, MVT::v8i8, 1},
844 {Intrinsic::bitreverse, MVT::v16i8, 1},
845 {Intrinsic::bitreverse, MVT::v4i16, 2},
846 {Intrinsic::bitreverse, MVT::v8i16, 2},
847 {Intrinsic::bitreverse, MVT::v2i32, 2},
848 {Intrinsic::bitreverse, MVT::v4i32, 2},
849 {Intrinsic::bitreverse, MVT::v1i64, 2},
850 {Intrinsic::bitreverse, MVT::v2i64, 2},
851 };
852 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
853 const auto *Entry =
854 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
855 if (Entry) {
856 // The cost model uses the legal type (i32) that i8 and i16 are promoted
857 // to, so add 1 to match the actual lowering cost.
858 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
859 TLI->getValueType(DL, RetTy, true) == MVT::i16)
860 return LegalisationCost.first * Entry->Cost + 1;
861
862 return LegalisationCost.first * Entry->Cost;
863 }
864 break;
865 }
866 case Intrinsic::ctpop: {
867 if (!ST->hasNEON()) {
868 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
869 return getTypeLegalizationCost(RetTy).first * 12;
870 }
871 static const CostTblEntry CtpopCostTbl[] = {
872 {ISD::CTPOP, MVT::v2i64, 4},
873 {ISD::CTPOP, MVT::v4i32, 3},
874 {ISD::CTPOP, MVT::v8i16, 2},
875 {ISD::CTPOP, MVT::v16i8, 1},
876 {ISD::CTPOP, MVT::i64, 4},
877 {ISD::CTPOP, MVT::v2i32, 3},
878 {ISD::CTPOP, MVT::v4i16, 2},
879 {ISD::CTPOP, MVT::v8i8, 1},
880 {ISD::CTPOP, MVT::i32, 5},
881 // SVE types (For targets that override NEON for fixed length vectors)
882 {ISD::CTPOP, MVT::nxv2i64, 1},
883 {ISD::CTPOP, MVT::nxv4i32, 1},
884 {ISD::CTPOP, MVT::nxv8i16, 1},
885 {ISD::CTPOP, MVT::nxv16i8, 1},
886 };
887 auto LT = getTypeLegalizationCost(RetTy);
888 MVT MTy = LT.second;
889
890 // When SVE is available CNT will be used for fixed and scalable vectors.
891 if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector())
893 128 / MTy.getScalarSizeInBits());
894
895 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
896 // Extra cost of +1 when illegal vector types are legalized by promoting
897 // the integer type.
898 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
899 RetTy->getScalarSizeInBits()
900 ? 1
901 : 0;
902 return LT.first * Entry->Cost + ExtraCost;
903 }
904 break;
905 }
906 case Intrinsic::sadd_with_overflow:
907 case Intrinsic::uadd_with_overflow:
908 case Intrinsic::ssub_with_overflow:
909 case Intrinsic::usub_with_overflow:
910 case Intrinsic::smul_with_overflow:
911 case Intrinsic::umul_with_overflow: {
912 static const CostTblEntry WithOverflowCostTbl[] = {
913 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
914 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
915 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
916 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
917 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
918 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
919 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
920 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
921 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
922 {Intrinsic::usub_with_overflow, MVT::i8, 3},
923 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
924 {Intrinsic::usub_with_overflow, MVT::i16, 3},
925 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
926 {Intrinsic::usub_with_overflow, MVT::i32, 1},
927 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
928 {Intrinsic::usub_with_overflow, MVT::i64, 1},
929 {Intrinsic::smul_with_overflow, MVT::i8, 5},
930 {Intrinsic::umul_with_overflow, MVT::i8, 4},
931 {Intrinsic::smul_with_overflow, MVT::i16, 5},
932 {Intrinsic::umul_with_overflow, MVT::i16, 4},
933 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
934 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
935 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
936 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
937 };
938 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
939 if (MTy.isSimple())
940 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
941 MTy.getSimpleVT()))
942 return Entry->Cost;
943 break;
944 }
945 case Intrinsic::fptosi_sat:
946 case Intrinsic::fptoui_sat: {
947 if (ICA.getArgTypes().empty())
948 break;
949 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
950 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
951 EVT MTy = TLI->getValueType(DL, RetTy);
952 // Check for the legal types, which are where the size of the input and the
953 // output are the same, or we are using cvt f64->i32 or f32->i64.
954 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
955 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
956 LT.second == MVT::v2f64)) {
957 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
958 (LT.second == MVT::f64 && MTy == MVT::i32) ||
959 (LT.second == MVT::f32 && MTy == MVT::i64)))
960 return LT.first;
961 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
962 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
963 MTy.getScalarSizeInBits() == 64)
964 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
965 }
966 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
967 // f32.
968 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
969 return LT.first + getIntrinsicInstrCost(
970 {ICA.getID(),
971 RetTy,
972 {ICA.getArgTypes()[0]->getWithNewType(
973 Type::getFloatTy(RetTy->getContext()))}},
974 CostKind);
975 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
976 (LT.second == MVT::f16 && MTy == MVT::i64) ||
977 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
978 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
979 return LT.first;
980 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
981 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
982 MTy.getScalarSizeInBits() == 32)
983 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
984 // Extending vector types to 64-bit elements, e.g. v4f16->v4i64. These
985 // currently scalarize but the codegen could be better.
986 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
987 MTy.getScalarSizeInBits() == 64)
988 return MTy.getVectorNumElements() * 3;
989
990 // If we can we use a legal convert followed by a min+max
991 if ((LT.second.getScalarType() == MVT::f32 ||
992 LT.second.getScalarType() == MVT::f64 ||
993 LT.second.getScalarType() == MVT::f16) &&
994 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
995 Type *LegalTy =
996 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
997 if (LT.second.isVector())
998 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
1000 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1001 : Intrinsic::umin,
1002 LegalTy, {LegalTy, LegalTy});
1004 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1005 : Intrinsic::umax,
1006 LegalTy, {LegalTy, LegalTy});
1008 return LT.first * Cost +
1009 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
1010 : 1);
1011 }
1012 // Otherwise we need to follow the default expansion that clamps the value
1013 // using a float min/max with a fcmp+sel for nan handling when signed.
1014 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
1015 RetTy = RetTy->getScalarType();
1016 if (LT.second.isVector()) {
1017 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
1018 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
1019 }
1020 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
1022 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
1024 Cost +=
1025 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1026 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
1027 if (IsSigned) {
1028 Type *CondTy = RetTy->getWithNewBitWidth(1);
1029 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
1031 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1033 }
1034 return LT.first * Cost;
1035 }
1036 case Intrinsic::fshl:
1037 case Intrinsic::fshr: {
1038 if (ICA.getArgs().empty())
1039 break;
1040
1041 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
1042
1043 // ROTR / ROTL is a funnel shift with equal first and second operand. For
1044 // ROTR on integer registers (i32/i64) this can be done in a single ror
1045 // instruction. A fshl with a non-constant shift uses a neg + ror.
1046 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
1047 (RetTy->getPrimitiveSizeInBits() == 32 ||
1048 RetTy->getPrimitiveSizeInBits() == 64)) {
1049 InstructionCost NegCost =
1050 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
1051 return 1 + NegCost;
1052 }
1053
1054 // TODO: Add handling for fshl where third argument is not a constant.
1055 if (!OpInfoZ.isConstant())
1056 break;
1057
1058 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
1059 if (OpInfoZ.isUniform()) {
1060 static const CostTblEntry FshlTbl[] = {
1061 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
1062 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
1063 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
1064 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
1065 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
1066 // to avoid having to duplicate the costs.
1067 const auto *Entry =
1068 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
1069 if (Entry)
1070 return LegalisationCost.first * Entry->Cost;
1071 }
1072
1073 auto TyL = getTypeLegalizationCost(RetTy);
1074 if (!RetTy->isIntegerTy())
1075 break;
1076
1077 // Estimate cost manually, as types like i8 and i16 will get promoted to
1078 // i32 and CostTableLookup will ignore the extra conversion cost.
1079 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1080 RetTy->getScalarSizeInBits() < 64) ||
1081 (RetTy->getScalarSizeInBits() % 64 != 0);
1082 unsigned ExtraCost = HigherCost ? 1 : 0;
1083 if (RetTy->getScalarSizeInBits() == 32 ||
1084 RetTy->getScalarSizeInBits() == 64)
1085 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1086 // extr instruction.
1087 else if (HigherCost)
1088 ExtraCost = 1;
1089 else
1090 break;
1091 return TyL.first + ExtraCost;
1092 }
1093 case Intrinsic::get_active_lane_mask: {
1094 auto RetTy = cast<VectorType>(ICA.getReturnType());
1095 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1096 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1097 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1098 break;
1099
1100 if (RetTy->isScalableTy()) {
1101 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1103 break;
1104
1105 auto LT = getTypeLegalizationCost(RetTy);
1106 InstructionCost Cost = LT.first;
1107 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1108 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1109 // nxv32i1 = get_active_lane_mask(base, idx) ->
1110 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1111 if (ST->hasSVE2p1() || ST->hasSME2()) {
1112 Cost /= 2;
1113 if (Cost == 1)
1114 return Cost;
1115 }
1116
1117 // If more than one whilelo intrinsic is required, include the extra cost
1118 // required by the saturating add & select required to increment the
1119 // start value after the first intrinsic call.
1120 Type *OpTy = ICA.getArgTypes()[0];
1121 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1122 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1123 Type *CondTy = OpTy->getWithNewBitWidth(1);
1124 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1126 return Cost + (SplitCost * (Cost - 1));
1127 } else if (!getTLI()->isTypeLegal(RetVT)) {
1128 // We don't have enough context at this point to determine if the mask
1129 // is going to be kept live after the block, which will force the vXi1
1130 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1131 // For now, we just assume the vectorizer created this intrinsic and
1132 // the result will be the input for a PHI. In this case the cost will
1133 // be extremely high for fixed-width vectors.
1134 // NOTE: getScalarizationOverhead returns a cost that's far too
1135 // pessimistic for the actual generated codegen. In reality there are
1136 // two instructions generated per lane.
1137 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1138 }
1139 break;
1140 }
1141 case Intrinsic::experimental_vector_match: {
1142 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1143 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1144 unsigned SearchSize = NeedleTy->getNumElements();
1145 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1146 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1147 // Neoverse V3, these are cheap operations with the same latency as a
1148 // vector ADD. In most cases, however, we also need to do an extra DUP.
1149 // For fixed-length vectors we currently need an extra five--six
1150 // instructions besides the MATCH.
1152 if (isa<FixedVectorType>(RetTy))
1153 Cost += 10;
1154 return Cost;
1155 }
1156 break;
1157 }
1158 case Intrinsic::cttz: {
1159 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1160 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1161 return LT.first * 2;
1162 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1163 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1164 return LT.first * 3;
1165 break;
1166 }
1167 case Intrinsic::experimental_cttz_elts: {
1168 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1169 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1170 // This will consist of a SVE brkb and a cntp instruction. These
1171 // typically have the same latency and half the throughput as a vector
1172 // add instruction.
1173 return 4;
1174 }
1175 break;
1176 }
1177 case Intrinsic::loop_dependence_raw_mask:
1178 case Intrinsic::loop_dependence_war_mask: {
1179 // The whilewr/rw instructions require SVE2 or SME.
1180 if (ST->hasSVE2() || ST->hasSME()) {
1181 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1182 unsigned EltSizeInBytes =
1183 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1184 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1185 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1186 break;
1187 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1188 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1189 }
1190 break;
1191 }
1192 case Intrinsic::experimental_vector_extract_last_active:
1193 if (ST->isSVEorStreamingSVEAvailable()) {
1194 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1195 // This should turn into chained clastb instructions.
1196 return LegalCost;
1197 }
1198 break;
1199 case Intrinsic::pow: {
1200 // For scalar calls we know the target has the libcall, and for fixed-width
1201 // vectors we know for the worst case it can be scalarised.
1202 EVT VT = getTLI()->getValueType(DL, RetTy);
1203 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1204 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1205 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;
1206
1207 // If we know that the call can be lowered with libcalls then it's safe to
1208 // reduce the costs in some cases. This is important for scalable vectors,
1209 // since we cannot scalarize the call in the absence of a vector math
1210 // library.
1211 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1212 // If we know the fast math flags and the exponent is a constant then the
1213 // cost may be less for some exponents like 0.25 and 0.75.
1214 const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
1215 if (ExpC && isa<VectorType>(ExpC->getType()))
1216 ExpC = ExpC->getSplatValue();
1217 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
1218 // The argument must be a FP constant.
1219 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1220 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1221 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1222 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1223 (!Is025 || FMF.noSignedZeros())) {
1224 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1226 if (Is025)
1227 return 2 * Sqrt;
1229 getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
1230 return (Sqrt * 2) + FMul;
1231 }
1232 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1233 // cheaper than pow.
1234 }
1235 }
1236
1237 if (HasLibcall)
1238 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1239 break;
1240 }
1241 case Intrinsic::sqrt:
1242 case Intrinsic::fabs:
1243 case Intrinsic::ceil:
1244 case Intrinsic::floor:
1245 case Intrinsic::nearbyint:
1246 case Intrinsic::round:
1247 case Intrinsic::rint:
1248 case Intrinsic::roundeven:
1249 case Intrinsic::trunc:
1250 case Intrinsic::minnum:
1251 case Intrinsic::maxnum:
1252 case Intrinsic::minimum:
1253 case Intrinsic::maximum: {
1254 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1255 auto LT = getTypeLegalizationCost(RetTy);
1256 return LT.first;
1257 }
1258 break;
1259 }
1260 default:
1261 break;
1262 }
1264}
1265
1266/// The function will remove redundant reinterprets casting in the presence
1267/// of the control flow
1268static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1269 IntrinsicInst &II) {
1271 auto RequiredType = II.getType();
1272
1273 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1274 assert(PN && "Expected Phi Node!");
1275
1276 // Don't create a new Phi unless we can remove the old one.
1277 if (!PN->hasOneUse())
1278 return std::nullopt;
1279
1280 for (Value *IncValPhi : PN->incoming_values()) {
1281 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1282 if (!Reinterpret ||
1283 Reinterpret->getIntrinsicID() !=
1284 Intrinsic::aarch64_sve_convert_to_svbool ||
1285 RequiredType != Reinterpret->getArgOperand(0)->getType())
1286 return std::nullopt;
1287 }
1288
1289 // Create the new Phi
1290 IC.Builder.SetInsertPoint(PN);
1291 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1292 Worklist.push_back(PN);
1293
1294 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1295 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1296 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1297 Worklist.push_back(Reinterpret);
1298 }
1299
1300 // Cleanup Phi Node and reinterprets
1301 return IC.replaceInstUsesWith(II, NPN);
1302}
1303
1304// A collection of properties common to SVE intrinsics that allow for combines
1305// to be written without needing to know the specific intrinsic.
1307 //
1308 // Helper routines for common intrinsic definitions.
1309 //
1310
1311 // e.g. llvm.aarch64.sve.add pg, op1, op2
1312 // with IID ==> llvm.aarch64.sve.add_u
1313 static SVEIntrinsicInfo
1320
1321 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1328
1329 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1335
1336 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1342
// Factory for the default info of "void" style intrinsics, such as the
// prefetch and store forms in the examples below. GPIndex is the operand
// position of the predicate `pg` in those examples.
1343 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1344 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1345 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1346 return SVEIntrinsicInfo()
1349 }
1350
1351 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1352 // llvm.aarch64.sve.ld1 pg, ptr
1359
1360 // All properties relate to predication and thus having a general predicate
1361 // is the minimum requirement to say there is intrinsic info to act on.
1362 explicit operator bool() const { return hasGoverningPredicate(); }
1363
1364 //
1365 // Properties relating to the governing predicate.
1366 //
1367
1369 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1370 }
1371
1373 assert(hasGoverningPredicate() && "Propery not set!");
1374 return GoverningPredicateIdx;
1375 }
1376
1378 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1379 GoverningPredicateIdx = Index;
1380 return *this;
1381 }
1382
1383 //
1384 // Properties relating to operations the intrinsic could be transformed into.
1385 // NOTE: This does not mean such a transformation is always possible, but the
1386 // knowledge makes it possible to reuse existing optimisations without needing
1387 // to embed specific handling for each intrinsic. For example, instruction
1388 // simplification can be used to optimise an intrinsic's active lanes.
1389 //
1390
1392 return UndefIntrinsic != Intrinsic::not_intrinsic;
1393 }
1394
1396 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1397 return UndefIntrinsic;
1398 }
1399
1401 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1402 UndefIntrinsic = IID;
1403 return *this;
1404 }
1405
1406 bool hasMatchingIROpode() const { return IROpcode != 0; }
1407
1408 unsigned getMatchingIROpode() const {
1409 assert(hasMatchingIROpode() && "Propery not set!");
1410 return IROpcode;
1411 }
1412
1414 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1415 IROpcode = Opcode;
1416 return *this;
1417 }
1418
1419 //
1420 // Properties relating to the result of inactive lanes.
1421 //
1422
1424 return ResultLanes == InactiveLanesTakenFromOperand;
1425 }
1426
1428 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1429 return OperandIdxForInactiveLanes;
1430 }
1431
1433 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1434 ResultLanes = InactiveLanesTakenFromOperand;
1435 OperandIdxForInactiveLanes = Index;
1436 return *this;
1437 }
1438
1440 return ResultLanes == InactiveLanesAreNotDefined;
1441 }
1442
1444 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1445 ResultLanes = InactiveLanesAreNotDefined;
1446 return *this;
1447 }
1448
1450 return ResultLanes == InactiveLanesAreUnused;
1451 }
1452
1454 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1455 ResultLanes = InactiveLanesAreUnused;
1456 return *this;
1457 }
1458
1459 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1460 // inactiveLanesAreZeroed =
1461 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1462 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1463
1465 ResultIsZeroInitialized = true;
1466 return *this;
1467 }
1468
1469 //
1470 // The first operand of unary merging operations is typically only used to
1471 // set the result for inactive lanes. Knowing this allows us to deadcode the
1472 // operand when we can prove there are no inactive lanes.
1473 //
1474
1476 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1477 }
1478
1480 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1481 return OperandIdxWithNoActiveLanes;
1482 }
1483
1485 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1486 OperandIdxWithNoActiveLanes = Index;
1487 return *this;
1488 }
1489
1490private:
// Operand index of the governing predicate. The maximum unsigned value means
// that no governing predicate has been recorded.
1491 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1492
// Intrinsic::not_intrinsic means that no matching undef intrinsic is recorded.
1493 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
// Zero means that no matching IR opcode is recorded.
1494 unsigned IROpcode = 0;
1495
// Describes how the inactive lanes of the result are defined. The setters
// assert that ResultLanes is still Uninitialized, so it can be set only once.
1496 enum PredicationStyle {
1498 InactiveLanesTakenFromOperand,
1499 InactiveLanesAreNotDefined,
1500 InactiveLanesAreUnused
1501 } ResultLanes = Uninitialized;
1502
// Independent flag; see resultIsZeroInitialized().
1503 bool ResultIsZeroInitialized = false;
// Operand that supplies the inactive lanes. It is written together with
// ResultLanes = InactiveLanesTakenFromOperand and read only in that state.
1504 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
// Index reported by the "operand with no active lanes" accessors. The maximum
// unsigned value means that it has not been set.
1505 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1506};
1507
1509 // Some SVE intrinsics do not use scalable vector types, but since they are
1510 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1511 if (!isa<ScalableVectorType>(II.getType()) &&
1512 all_of(II.args(), [&](const Value *V) {
1513 return !isa<ScalableVectorType>(V->getType());
1514 }))
1515 return SVEIntrinsicInfo();
1516
1517 Intrinsic::ID IID = II.getIntrinsicID();
1518 switch (IID) {
1519 default:
1520 break;
1521 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1522 case Intrinsic::aarch64_sve_fcvt_f16f32:
1523 case Intrinsic::aarch64_sve_fcvt_f16f64:
1524 case Intrinsic::aarch64_sve_fcvt_f32f16:
1525 case Intrinsic::aarch64_sve_fcvt_f32f64:
1526 case Intrinsic::aarch64_sve_fcvt_f64f16:
1527 case Intrinsic::aarch64_sve_fcvt_f64f32:
1528 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1529 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1530 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1531 case Intrinsic::aarch64_sve_fcvtzs:
1532 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1533 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1534 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1535 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1536 case Intrinsic::aarch64_sve_fcvtzu:
1537 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1538 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1539 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1540 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1541 case Intrinsic::aarch64_sve_revb:
1542 case Intrinsic::aarch64_sve_revh:
1543 case Intrinsic::aarch64_sve_revw:
1544 case Intrinsic::aarch64_sve_revd:
1545 case Intrinsic::aarch64_sve_scvtf:
1546 case Intrinsic::aarch64_sve_scvtf_f16i32:
1547 case Intrinsic::aarch64_sve_scvtf_f16i64:
1548 case Intrinsic::aarch64_sve_scvtf_f32i64:
1549 case Intrinsic::aarch64_sve_scvtf_f64i32:
1550 case Intrinsic::aarch64_sve_ucvtf:
1551 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1552 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1553 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1554 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1556
1557 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1558 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1559 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1560 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1562
1563 case Intrinsic::aarch64_sve_fabd:
1564 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1565 case Intrinsic::aarch64_sve_fadd:
1566 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1567 .setMatchingIROpcode(Instruction::FAdd);
1568 case Intrinsic::aarch64_sve_fdiv:
1569 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1570 .setMatchingIROpcode(Instruction::FDiv);
1571 case Intrinsic::aarch64_sve_fmax:
1572 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1573 case Intrinsic::aarch64_sve_fmaxnm:
1574 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1575 case Intrinsic::aarch64_sve_fmin:
1576 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1577 case Intrinsic::aarch64_sve_fminnm:
1578 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1579 case Intrinsic::aarch64_sve_fmla:
1580 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1581 case Intrinsic::aarch64_sve_fmls:
1582 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1583 case Intrinsic::aarch64_sve_fmul:
1584 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1585 .setMatchingIROpcode(Instruction::FMul);
1586 case Intrinsic::aarch64_sve_fmulx:
1587 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1588 case Intrinsic::aarch64_sve_fnmla:
1589 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1590 case Intrinsic::aarch64_sve_fnmls:
1591 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1592 case Intrinsic::aarch64_sve_fsub:
1593 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1594 .setMatchingIROpcode(Instruction::FSub);
1595 case Intrinsic::aarch64_sve_add:
1596 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1597 .setMatchingIROpcode(Instruction::Add);
1598 case Intrinsic::aarch64_sve_mla:
1599 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1600 case Intrinsic::aarch64_sve_mls:
1601 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1602 case Intrinsic::aarch64_sve_mul:
1603 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1604 .setMatchingIROpcode(Instruction::Mul);
1605 case Intrinsic::aarch64_sve_sabd:
1606 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1607 case Intrinsic::aarch64_sve_sdiv:
1608 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1609 .setMatchingIROpcode(Instruction::SDiv);
1610 case Intrinsic::aarch64_sve_smax:
1611 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1612 case Intrinsic::aarch64_sve_smin:
1613 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1614 case Intrinsic::aarch64_sve_smulh:
1615 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1616 case Intrinsic::aarch64_sve_sub:
1617 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1618 .setMatchingIROpcode(Instruction::Sub);
1619 case Intrinsic::aarch64_sve_uabd:
1620 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1621 case Intrinsic::aarch64_sve_udiv:
1622 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1623 .setMatchingIROpcode(Instruction::UDiv);
1624 case Intrinsic::aarch64_sve_umax:
1625 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1626 case Intrinsic::aarch64_sve_umin:
1627 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1628 case Intrinsic::aarch64_sve_umulh:
1629 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1630 case Intrinsic::aarch64_sve_asr:
1631 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1632 .setMatchingIROpcode(Instruction::AShr);
1633 case Intrinsic::aarch64_sve_lsl:
1634 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1635 .setMatchingIROpcode(Instruction::Shl);
1636 case Intrinsic::aarch64_sve_lsr:
1637 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1638 .setMatchingIROpcode(Instruction::LShr);
1639 case Intrinsic::aarch64_sve_and:
1640 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1641 .setMatchingIROpcode(Instruction::And);
1642 case Intrinsic::aarch64_sve_bic:
1643 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1644 case Intrinsic::aarch64_sve_eor:
1645 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1646 .setMatchingIROpcode(Instruction::Xor);
1647 case Intrinsic::aarch64_sve_orr:
1648 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1649 .setMatchingIROpcode(Instruction::Or);
1650 case Intrinsic::aarch64_sve_shsub:
1651 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1652 case Intrinsic::aarch64_sve_shsubr:
1654 case Intrinsic::aarch64_sve_sqrshl:
1655 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1656 case Intrinsic::aarch64_sve_sqshl:
1657 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1658 case Intrinsic::aarch64_sve_sqsub:
1659 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1660 case Intrinsic::aarch64_sve_srshl:
1661 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1662 case Intrinsic::aarch64_sve_uhsub:
1663 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1664 case Intrinsic::aarch64_sve_uhsubr:
1666 case Intrinsic::aarch64_sve_uqrshl:
1667 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1668 case Intrinsic::aarch64_sve_uqshl:
1669 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1670 case Intrinsic::aarch64_sve_uqsub:
1671 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1672 case Intrinsic::aarch64_sve_urshl:
1673 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1674
1675 case Intrinsic::aarch64_sve_add_u:
1677 Instruction::Add);
1678 case Intrinsic::aarch64_sve_and_u:
1680 Instruction::And);
1681 case Intrinsic::aarch64_sve_asr_u:
1683 Instruction::AShr);
1684 case Intrinsic::aarch64_sve_eor_u:
1686 Instruction::Xor);
1687 case Intrinsic::aarch64_sve_fadd_u:
1689 Instruction::FAdd);
1690 case Intrinsic::aarch64_sve_fdiv_u:
1692 Instruction::FDiv);
1693 case Intrinsic::aarch64_sve_fmul_u:
1695 Instruction::FMul);
1696 case Intrinsic::aarch64_sve_fsub_u:
1698 Instruction::FSub);
1699 case Intrinsic::aarch64_sve_lsl_u:
1701 Instruction::Shl);
1702 case Intrinsic::aarch64_sve_lsr_u:
1704 Instruction::LShr);
1705 case Intrinsic::aarch64_sve_mul_u:
1707 Instruction::Mul);
1708 case Intrinsic::aarch64_sve_orr_u:
1710 Instruction::Or);
1711 case Intrinsic::aarch64_sve_sdiv_u:
1713 Instruction::SDiv);
1714 case Intrinsic::aarch64_sve_sub_u:
1716 Instruction::Sub);
1717 case Intrinsic::aarch64_sve_udiv_u:
1719 Instruction::UDiv);
1720
1721 case Intrinsic::aarch64_sve_addqv:
1722 case Intrinsic::aarch64_sve_and_z:
1723 case Intrinsic::aarch64_sve_bic_z:
1724 case Intrinsic::aarch64_sve_brka_z:
1725 case Intrinsic::aarch64_sve_brkb_z:
1726 case Intrinsic::aarch64_sve_brkn_z:
1727 case Intrinsic::aarch64_sve_brkpa_z:
1728 case Intrinsic::aarch64_sve_brkpb_z:
1729 case Intrinsic::aarch64_sve_cntp:
1730 case Intrinsic::aarch64_sve_compact:
1731 case Intrinsic::aarch64_sve_eor_z:
1732 case Intrinsic::aarch64_sve_eorv:
1733 case Intrinsic::aarch64_sve_eorqv:
1734 case Intrinsic::aarch64_sve_nand_z:
1735 case Intrinsic::aarch64_sve_nor_z:
1736 case Intrinsic::aarch64_sve_orn_z:
1737 case Intrinsic::aarch64_sve_orr_z:
1738 case Intrinsic::aarch64_sve_orv:
1739 case Intrinsic::aarch64_sve_orqv:
1740 case Intrinsic::aarch64_sve_pnext:
1741 case Intrinsic::aarch64_sve_rdffr_z:
1742 case Intrinsic::aarch64_sve_saddv:
1743 case Intrinsic::aarch64_sve_uaddv:
1744 case Intrinsic::aarch64_sve_umaxv:
1745 case Intrinsic::aarch64_sve_umaxqv:
1746 case Intrinsic::aarch64_sve_cmpeq:
1747 case Intrinsic::aarch64_sve_cmpeq_wide:
1748 case Intrinsic::aarch64_sve_cmpge:
1749 case Intrinsic::aarch64_sve_cmpge_wide:
1750 case Intrinsic::aarch64_sve_cmpgt:
1751 case Intrinsic::aarch64_sve_cmpgt_wide:
1752 case Intrinsic::aarch64_sve_cmphi:
1753 case Intrinsic::aarch64_sve_cmphi_wide:
1754 case Intrinsic::aarch64_sve_cmphs:
1755 case Intrinsic::aarch64_sve_cmphs_wide:
1756 case Intrinsic::aarch64_sve_cmple_wide:
1757 case Intrinsic::aarch64_sve_cmplo_wide:
1758 case Intrinsic::aarch64_sve_cmpls_wide:
1759 case Intrinsic::aarch64_sve_cmplt_wide:
1760 case Intrinsic::aarch64_sve_cmpne:
1761 case Intrinsic::aarch64_sve_cmpne_wide:
1762 case Intrinsic::aarch64_sve_facge:
1763 case Intrinsic::aarch64_sve_facgt:
1764 case Intrinsic::aarch64_sve_fcmpeq:
1765 case Intrinsic::aarch64_sve_fcmpge:
1766 case Intrinsic::aarch64_sve_fcmpgt:
1767 case Intrinsic::aarch64_sve_fcmpne:
1768 case Intrinsic::aarch64_sve_fcmpuo:
1769 case Intrinsic::aarch64_sve_ld1:
1770 case Intrinsic::aarch64_sve_ld1_gather:
1771 case Intrinsic::aarch64_sve_ld1_gather_index:
1772 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1773 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1774 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1775 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1776 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1777 case Intrinsic::aarch64_sve_ld1q_gather_index:
1778 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1779 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1780 case Intrinsic::aarch64_sve_ld1ro:
1781 case Intrinsic::aarch64_sve_ld1rq:
1782 case Intrinsic::aarch64_sve_ld1udq:
1783 case Intrinsic::aarch64_sve_ld1uwq:
1784 case Intrinsic::aarch64_sve_ld2_sret:
1785 case Intrinsic::aarch64_sve_ld2q_sret:
1786 case Intrinsic::aarch64_sve_ld3_sret:
1787 case Intrinsic::aarch64_sve_ld3q_sret:
1788 case Intrinsic::aarch64_sve_ld4_sret:
1789 case Intrinsic::aarch64_sve_ld4q_sret:
1790 case Intrinsic::aarch64_sve_ldff1:
1791 case Intrinsic::aarch64_sve_ldff1_gather:
1792 case Intrinsic::aarch64_sve_ldff1_gather_index:
1793 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1794 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1795 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1796 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1797 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1798 case Intrinsic::aarch64_sve_ldnf1:
1799 case Intrinsic::aarch64_sve_ldnt1:
1800 case Intrinsic::aarch64_sve_ldnt1_gather:
1801 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1802 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1803 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1805
1806 case Intrinsic::aarch64_sve_prf:
1807 case Intrinsic::aarch64_sve_prfb_gather_index:
1808 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1809 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1810 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1811 case Intrinsic::aarch64_sve_prfd_gather_index:
1812 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1813 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1814 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1815 case Intrinsic::aarch64_sve_prfh_gather_index:
1816 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1817 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1818 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1819 case Intrinsic::aarch64_sve_prfw_gather_index:
1820 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1821 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1822 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1824
1825 case Intrinsic::aarch64_sve_st1_scatter:
1826 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1827 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1828 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1829 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1830 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1831 case Intrinsic::aarch64_sve_st1dq:
1832 case Intrinsic::aarch64_sve_st1q_scatter_index:
1833 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1834 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1835 case Intrinsic::aarch64_sve_st1wq:
1836 case Intrinsic::aarch64_sve_stnt1:
1837 case Intrinsic::aarch64_sve_stnt1_scatter:
1838 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1839 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1840 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1842 case Intrinsic::aarch64_sve_st2:
1843 case Intrinsic::aarch64_sve_st2q:
1845 case Intrinsic::aarch64_sve_st3:
1846 case Intrinsic::aarch64_sve_st3q:
1848 case Intrinsic::aarch64_sve_st4:
1849 case Intrinsic::aarch64_sve_st4q:
1851 }
1852
1853 return SVEIntrinsicInfo();
1854}
1855
// Returns true when `Pred` is a Constant for which isAllOnesValue() holds,
// after first looking through predicate casts that only remove lanes.
//
// NOTE(review): listing lines 1860 and 1865 are missing from this copy of the
// file. Each presumably opens the `if (match(Pred, ...` whose final
// `m_Value(UncastedPred)` argument survives on the following line; restore
// them from the upstream source before relying on this text.
1856static bool isAllActivePredicate(Value *Pred) {
1857 Value *UncastedPred;
1858
1859 // Look through predicate casts that only remove lanes.
1861 m_Value(UncastedPred)))) {
1862 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1863 Pred = UncastedPred;
1864
1866 m_Value(UncastedPred))))
1867 // If the predicate has the same or less lanes than the uncasted predicate
1868 // then we know the casting has no effect.
1869 if (OrigPredTy->getMinNumElements() <=
1870 cast<ScalableVectorType>(UncastedPred->getType())
1871 ->getMinNumElements())
1872 Pred = UncastedPred;
1873 }
1874
// Anything that is not a Constant (after the look-through above) is reported
// as "not all active".
1875 auto *C = dyn_cast<Constant>(Pred);
1876 return C && C->isAllOnesValue();
1877}
1878
1879// Simplify `V` by only considering the operations that affect active lanes.
1880// This function should only return existing Values or newly created Constants.
//
// The only case handled is an aarch64_sve_dup whose operand 1 is exactly `Pg`
// and whose operand 2 is a Constant; every other value is returned unchanged.
1881static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1882 auto *Dup = dyn_cast<IntrinsicInst>(V);
1883 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1884 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
// NOTE(review): listing line 1885 is missing here. It presumably starts the
// `return` of a constant built from the element count of V's type and the
// dup's constant operand (the two arguments below) — confirm upstream.
1886 cast<VectorType>(V->getType())->getElementCount(),
1887 cast<Constant>(Dup->getOperand(2)));
1888
1889 return V;
1890}
1891
// Attempts to simplify a predicated SVE binary intrinsic by running
// simplifyBinOp on the IR opcode reported by `IInfo`, using operands 1 and 2
// of `II` and the governing predicate in operand 0.
//
// Returns std::nullopt when nothing simplifies; otherwise returns the result
// of replacing II's uses (or `&II` after operands were swapped).
//
// NOTE(review): listing line 1893 (the line carrying the function name and
// leading parameters) is missing from this copy. The only in-view call,
// `simplifySVEIntrinsicBinOp(IC, II, IInfo)`, suggests the missing parameters
// are the InstCombiner and the IntrinsicInst — confirm upstream.
1892static std::optional<Instruction *>
1894 const SVEIntrinsicInfo &IInfo) {
1895 const unsigned Opc = IInfo.getMatchingIROpode();
1896 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1897
1898 Value *Pg = II.getOperand(0);
1899 Value *Op1 = II.getOperand(1);
1900 Value *Op2 = II.getOperand(2);
1901 const DataLayout &DL = II.getDataLayout();
1902
1903 // Canonicalise constants to the RHS.
// NOTE(review): listing line 1904 is missing; it presumably opens this `if`
// with further preconditions for the operand swap (only the constant checks
// survive below) — confirm upstream.
1905 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1906 IC.replaceOperand(II, 1, Op2);
1907 IC.replaceOperand(II, 2, Op1);
1908 return &II;
1909 }
1910
1911 // Only active lanes matter when simplifying the operation.
1912 Op1 = stripInactiveLanes(Op1, Pg);
1913 Op2 = stripInactiveLanes(Op2, Pg);
1914
// Floating-point intrinsics forward their fast-math flags to the simplifier.
1915 Value *SimpleII;
1916 if (auto FII = dyn_cast<FPMathOperator>(&II))
1917 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1918 else
1919 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1920
1921 // An SVE intrinsic's result is always defined. However, this is not the case
1922 // for its equivalent IR instruction (e.g. when shifting by an amount more
1923 // than the data's bitwidth). Simplifications to an undefined result must be
1924 // ignored to preserve the intrinsic's expected behaviour.
1925 if (!SimpleII || isa<UndefValue>(SimpleII))
1926 return std::nullopt;
1927
// With undefined inactive lanes the simplified value can be used as-is.
1928 if (IInfo.inactiveLanesAreNotDefined())
1929 return IC.replaceInstUsesWith(II, SimpleII);
1930
1931 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1932
1933 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1934 if (SimpleII == Inactive)
1935 return IC.replaceInstUsesWith(II, SimpleII);
1936
1937 // Inactive lanes must be preserved.
1938 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1939 return IC.replaceInstUsesWith(II, SimpleII);
1940}
1941
1942// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1943// to operations with less strict inactive lane requirements.
//
// Returns std::nullopt when `IInfo` reports no governing predicate or when no
// rule below applies.
//
// NOTE(review): listing line 1945 (function name and leading parameters) is
// missing from this copy; the body uses `IC` and `II`, so those are presumably
// declared there — confirm upstream.
1944static std::optional<Instruction *>
1946 const SVEIntrinsicInfo &IInfo) {
1947 if (!IInfo.hasGoverningPredicate())
1948 return std::nullopt;
1949
1950 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1951
1952 // If there are no active lanes.
1953 if (match(OpPredicate, m_ZeroInt())) {
// NOTE(review): listing line 1954 is missing; it presumably holds the
// condition guarding this return (the result is the operand that supplies the
// inactive lanes) — confirm upstream.
1955 return IC.replaceInstUsesWith(
1956 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1957
1958 if (IInfo.inactiveLanesAreUnused()) {
1959 if (IInfo.resultIsZeroInitialized())
// NOTE(review): listing line 1960 is missing; it is the statement controlled
// by the `if` above, presumably replacing II's uses with a zero value before
// the erase below — confirm upstream.
1961
1962 return IC.eraseInstFromFunction(II);
1963 }
1964 }
1965
1966 // If there are no inactive lanes.
1967 if (isAllActivePredicate(OpPredicate)) {
// An operand that contributes no active lanes is replaced with undef (once).
1968 if (IInfo.hasOperandWithNoActiveLanes()) {
1969 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1970 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1971 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1972 }
1973
// Retarget the call in place to the matching "undef" variant of the intrinsic.
1974 if (IInfo.hasMatchingUndefIntrinsic()) {
1975 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1976 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1977 II.setCalledFunction(NewDecl);
1978 return &II;
1979 }
1980 }
1981
1982 // Operation specific simplifications.
// NOTE(review): listing line 1984 (the second half of this condition) is
// missing — confirm upstream.
1983 if (IInfo.hasMatchingIROpode() &&
1985 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1986
1987 return std::nullopt;
1988}
1989
1990// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1991// => (binop (pred) (from_svbool _) (from_svbool _))
1992//
1993// The above transformation eliminates a `to_svbool` in the predicate
1994// operand of bitwise operation `binop` by narrowing the vector width of
1995// the operation. For example, it would convert a `<vscale x 16 x i1>
1996// and` into a `<vscale x 4 x i1> and`. This is profitable because
1997// to_svbool must zero the new lanes during widening, whereas
1998// from_svbool is free.
//
// NOTE(review): listing line 2000 (function name and parameter list) is
// missing from this copy. The in-view call `tryCombineFromSVBoolBinOp(IC, II)`
// and the body's use of `IC.Builder` / `II.getOperand` suggest an
// (InstCombiner &IC, IntrinsicInst &II) signature — confirm upstream.
1999static std::optional<Instruction *>
2001 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
2002 if (!BinOp)
2003 return std::nullopt;
2004
// Only the zeroing predicate logical intrinsics listed here are narrowed.
2005 auto IntrinsicID = BinOp->getIntrinsicID();
2006 switch (IntrinsicID) {
2007 case Intrinsic::aarch64_sve_and_z:
2008 case Intrinsic::aarch64_sve_bic_z:
2009 case Intrinsic::aarch64_sve_eor_z:
2010 case Intrinsic::aarch64_sve_nand_z:
2011 case Intrinsic::aarch64_sve_nor_z:
2012 case Intrinsic::aarch64_sve_orn_z:
2013 case Intrinsic::aarch64_sve_orr_z:
2014 break;
2015 default:
2016 return std::nullopt;
2017 }
2018
2019 auto BinOpPred = BinOp->getOperand(0);
2020 auto BinOpOp1 = BinOp->getOperand(1);
2021 auto BinOpOp2 = BinOp->getOperand(2);
2022
// The binop's predicate must itself be a convert_to_svbool call.
2023 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
2024 if (!PredIntr ||
2025 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
2026 return std::nullopt;
2027
// The unwidened predicate must already have the type II produces.
2028 auto PredOp = PredIntr->getOperand(0);
2029 auto PredOpTy = cast<VectorType>(PredOp->getType());
2030 if (PredOpTy != II.getType())
2031 return std::nullopt;
2032
// Narrow both data operands; a shared operand is converted only once.
2033 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
2034 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
2035 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
2036 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
2037 if (BinOpOp1 == BinOpOp2)
2038 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
2039 else
2040 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
2041 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
2042
2043 auto NarrowedBinOp =
2044 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
2045 return IC.replaceInstUsesWith(II, NarrowedBinOp);
2046}
2047
// Simplifies a predicate conversion by (1) delegating PHI operands to
// processPhiNode, (2) trying tryCombineFromSVBoolBinOp, and (3) walking back
// through a chain of convert_to_svbool / convert_from_svbool calls looking for
// an earlier value that already has II's type.
//
// NOTE(review): listing line 2049 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm the signature
// upstream.
2048static std::optional<Instruction *>
2050 // If the reinterpret instruction operand is a PHI Node
2051 if (isa<PHINode>(II.getArgOperand(0)))
2052 return processPhiNode(IC, II);
2053
2054 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
2055 return BinOpCombine;
2056
2057 // Ignore converts to/from svcount_t.
2058 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
2059 isa<TargetExtType>(II.getType()))
2060 return std::nullopt;
2061
// NOTE(review): CandidatesForRemoval is filled in the loop below but is not
// read again anywhere in this function as shown.
2062 SmallVector<Instruction *, 32> CandidatesForRemoval;
2063 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
2064
2065 const auto *IVTy = cast<VectorType>(II.getType());
2066
2067 // Walk the chain of conversions.
2068 while (Cursor) {
2069 // If the type of the cursor has fewer lanes than the final result, zeroing
2070 // must take place, which breaks the equivalence chain.
2071 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
2072 if (CursorVTy->getElementCount().getKnownMinValue() <
2073 IVTy->getElementCount().getKnownMinValue())
2074 break;
2075
2076 // If the cursor has the same type as I, it is a viable replacement.
2077 if (Cursor->getType() == IVTy)
2078 EarliestReplacement = Cursor;
2079
2080 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
2081
2082 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2083 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2084 Intrinsic::aarch64_sve_convert_to_svbool ||
2085 IntrinsicCursor->getIntrinsicID() ==
2086 Intrinsic::aarch64_sve_convert_from_svbool))
2087 break;
2088
2089 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
2090 Cursor = IntrinsicCursor->getOperand(0);
2091 }
2092
2093 // If no viable replacement in the conversion chain was found, there is
2094 // nothing to do.
2095 if (!EarliestReplacement)
2096 return std::nullopt;
2097
2098 return IC.replaceInstUsesWith(II, EarliestReplacement);
2099}
2100
2101static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2102 IntrinsicInst &II) {
2103 // svsel(ptrue, x, y) => x
2104 auto *OpPredicate = II.getOperand(0);
2105 if (isAllActivePredicate(OpPredicate))
2106 return IC.replaceInstUsesWith(II, II.getOperand(1));
2107
2108 auto Select =
2109 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2110 return IC.replaceInstUsesWith(II, Select);
2111}
2112
// Rewrites the predicated sve.dup intrinsic into generic IR in two cases:
// an all-active predicate becomes a vector splat of operand 2, and (per the
// comment below) a ptrue(vl1) predicate becomes an insertelement at lane 0 of
// operand 0. Any other predicate yields std::nullopt.
2113static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2114 IntrinsicInst &II) {
2115 Value *Pg = II.getOperand(1);
2116
2117 // sve.dup(V, all_active, X) ==> splat(X)
2118 if (isAllActivePredicate(Pg)) {
2119 auto *RetTy = cast<ScalableVectorType>(II.getType());
2120 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2121 II.getArgOperand(2));
2122 return IC.replaceInstUsesWith(II, Splat);
2123 }
2124
// NOTE(review): listing line 2125 is missing; it presumably opens an
// `if (!match(Pg, ...` testing for a ptrue whose pattern is the vl1 constant
// matched on the next line — confirm upstream.
2126 m_SpecificInt(AArch64SVEPredPattern::vl1))))
2127 return std::nullopt;
2128
2129 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2130 Value *Insert = IC.Builder.CreateInsertElement(
2131 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
2132 return IC.replaceInstUsesWith(II, Insert);
2133}
2134
2135static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 // Replace DupX with a regular IR splat.
2138 auto *RetTy = cast<ScalableVectorType>(II.getType());
2139 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2140 II.getArgOperand(0));
2141 Splat->takeName(&II);
2142 return IC.replaceInstUsesWith(II, Splat);
2143}
2144
// Folds an all-active "compare not-equal with zero" of a lane-0 dupq_lane of
// a constant fixed-width vector (inserted into undef at index 0) into either
// an all-false predicate or an all-true predicate of a suitable element size,
// converted via to_svbool / from_svbool to II's type.
// Returns std::nullopt whenever any of the structural checks below fails.
2145static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2146 IntrinsicInst &II) {
2147 LLVMContext &Ctx = II.getContext();
2148
2149 if (!isAllActivePredicate(II.getArgOperand(0)))
2150 return std::nullopt;
2151
2152 // Check that we have a compare of zero..
// NOTE(review): listing line 2154 (the initialiser of SplatValue) is missing;
// given the `isZero()` call below it presumably yields a ConstantInt* (or
// null) derived from one of II's operands — confirm upstream.
2153 auto *SplatValue =
2155 if (!SplatValue || !SplatValue->isZero())
2156 return std::nullopt;
2157
2158 // ..against a dupq
2159 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2160 if (!DupQLane ||
2161 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2162 return std::nullopt;
2163
2164 // Where the dupq is a lane 0 replicate of a vector insert
2165 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2166 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2167 return std::nullopt;
2168
2169 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2170 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2171 return std::nullopt;
2172
2173 // Where the vector insert is a fixed constant vector insert into undef at
2174 // index zero
2175 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2176 return std::nullopt;
2177
2178 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2179 return std::nullopt;
2180
2181 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2182 if (!ConstVec)
2183 return std::nullopt;
2184
// The inserted fixed vector must have exactly as many elements as the minimum
// element count of the scalable result type.
2185 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2186 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2187 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2188 return std::nullopt;
2189
2190 unsigned NumElts = VecTy->getNumElements();
2191 unsigned PredicateBits = 0;
2192
2193 // Expand intrinsic operands to a 16-bit byte level predicate
// Element I maps to bit I * (16 / NumElts); every element must be a
// ConstantInt or the fold is abandoned.
2194 for (unsigned I = 0; I < NumElts; ++I) {
2195 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2196 if (!Arg)
2197 return std::nullopt;
2198 if (!Arg->isZero())
2199 PredicateBits |= 1 << (I * (16 / NumElts));
2200 }
2201
2202 // If all bits are zero bail early with an empty predicate
2203 if (PredicateBits == 0) {
2204 auto *PFalse = Constant::getNullValue(II.getType());
2205 PFalse->takeName(&II);
2206 return IC.replaceInstUsesWith(II, PFalse);
2207 }
2208
2209 // Calculate largest predicate type used (where byte predicate is largest)
// Mask starts at 8 and accumulates (I % 8) for every set bit I.
2210 unsigned Mask = 8;
2211 for (unsigned I = 0; I < 16; ++I)
2212 if ((PredicateBits & (1 << I)) != 0)
2213 Mask |= (I % 8);
2214
// `Mask & -Mask` isolates the lowest set bit of Mask.
2215 unsigned PredSize = Mask & -Mask;
2216 auto *PredType = ScalableVectorType::get(
2217 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2218
2219 // Ensure all relevant bits are set
2220 for (unsigned I = 0; I < 16; I += PredSize)
2221 if ((PredicateBits & (1 << I)) == 0)
2222 return std::nullopt;
2223
// Build an all-true predicate of PredType and round-trip it through svbool to
// obtain a value of II's type.
2224 auto *ConvertToSVBool =
2225 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
2226 PredType, ConstantInt::getTrue(PredType));
2227 auto *ConvertFromSVBool =
2228 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2229 II.getType(), ConvertToSVBool);
2230
2231 ConvertFromSVBool->takeName(&II);
2232 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2233}
2234
// Simplifies the lasta / lastb intrinsics (operand 0 = predicate, operand 1 =
// vector). `IsAfter` is true for aarch64_sve_lasta. Handles, in order: a splat
// vector, a one-use binop with a splat operand, lasta under an all-false
// constant predicate, and a ptrue predicate whose pattern gives a known lane.
2235static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2236 IntrinsicInst &II) {
2237 Value *Pg = II.getArgOperand(0);
2238 Value *Vec = II.getArgOperand(1);
2239 auto IntrinsicID = II.getIntrinsicID();
2240 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2241
2242 // lastX(splat(X)) --> X
2243 if (auto *SplatVal = getSplatValue(Vec))
2244 return IC.replaceInstUsesWith(II, SplatVal);
2245
2246 // If x and/or y is a splat value then:
2247 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2248 Value *LHS, *RHS;
2249 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2250 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2251 auto *OldBinOp = cast<BinaryOperator>(Vec);
2252 auto OpC = OldBinOp->getOpcode();
2253 auto *NewLHS =
2254 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2255 auto *NewRHS =
2256 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
// NOTE(review): listing line 2257 is missing; it presumably declares
// `NewBinOp` and begins the call that builds the scalar binop from the
// arguments on the next line — confirm upstream.
2258 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2259 return IC.replaceInstUsesWith(II, NewBinOp);
2260 }
2261 }
2262
2263 auto *C = dyn_cast<Constant>(Pg);
2264 if (IsAfter && C && C->isNullValue()) {
2265 // The intrinsic is extracting lane 0 so use an extract instead.
2266 auto *IdxTy = Type::getInt64Ty(II.getContext());
2267 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2268 Extract->insertBefore(II.getIterator());
2269 Extract->takeName(&II);
2270 return IC.replaceInstUsesWith(II, Extract);
2271 }
2272
// From here on only a predicate produced by aarch64_sve_ptrue is handled.
2273 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2274 if (!IntrPG)
2275 return std::nullopt;
2276
2277 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2278 return std::nullopt;
2279
2280 const auto PTruePattern =
2281 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2282
2283 // Can the intrinsic's predicate be converted to a known constant index?
2284 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2285 if (!MinNumElts)
2286 return std::nullopt;
2287
2288 unsigned Idx = MinNumElts - 1;
2289 // Increment the index if extracting the element after the last active
2290 // predicate element.
2291 if (IsAfter)
2292 ++Idx;
2293
2294 // Ignore extracts whose index is larger than the known minimum vector
2295 // length. NOTE: This is an artificial constraint where we prefer to
2296 // maintain what the user asked for until an alternative is proven faster.
2297 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2298 if (Idx >= PgVTy->getMinNumElements())
2299 return std::nullopt;
2300
2301 // The intrinsic is extracting a fixed lane so use an extract instead.
2302 auto *IdxTy = Type::getInt64Ty(II.getContext());
2303 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2304 Extract->insertBefore(II.getIterator());
2305 Extract->takeName(&II);
2306 return IC.replaceInstUsesWith(II, Extract);
2307}
2308
2309static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2310 IntrinsicInst &II) {
2311 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2312 // integer variant across a variety of micro-architectures. Replace scalar
2313 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2314 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2315 // depending on the micro-architecture, but has been observed as generally
2316 // being faster, particularly when the CLAST[AB] op is a loop-carried
2317 // dependency.
2318 Value *Pg = II.getArgOperand(0);
2319 Value *Fallback = II.getArgOperand(1);
2320 Value *Vec = II.getArgOperand(2);
2321 Type *Ty = II.getType();
2322
2323 if (!Ty->isIntegerTy())
2324 return std::nullopt;
2325
2326 Type *FPTy;
2327 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2328 default:
2329 return std::nullopt;
2330 case 16:
2331 FPTy = IC.Builder.getHalfTy();
2332 break;
2333 case 32:
2334 FPTy = IC.Builder.getFloatTy();
2335 break;
2336 case 64:
2337 FPTy = IC.Builder.getDoubleTy();
2338 break;
2339 }
2340
2341 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2342 auto *FPVTy = VectorType::get(
2343 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2344 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2345 auto *FPII = IC.Builder.CreateIntrinsic(
2346 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2347 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2348 return IC.replaceInstUsesWith(II, FPIItoInt);
2349}
2350
2351static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2352 IntrinsicInst &II) {
2353 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2354 // can work with RDFFR_PP for ptest elimination.
2355 auto *RDFFR = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z,
2356 ConstantInt::getTrue(II.getType()));
2357 RDFFR->takeName(&II);
2358 return IC.replaceInstUsesWith(II, RDFFR);
2359}
2360
// Folds an element-count intrinsic whose operand 0 is a constant predicate
// pattern. The `all` pattern becomes a scalable element count; any other
// pattern with a known fixed element count that does not exceed NumElts
// becomes a plain integer constant. Otherwise std::nullopt is returned.
//
// NOTE(review): listing line 2362 (function name and parameter list) is
// missing from this copy. The body uses `IC`, `II` and `NumElts`, so all three
// are presumably parameters — confirm upstream.
2361static std::optional<Instruction *>
2363 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2364
2365 if (Pattern == AArch64SVEPredPattern::all) {
// NOTE(review): listing line 2366 is missing; it presumably declares `Cnt`
// and begins the builder call whose arguments follow — confirm upstream.
2367 II.getType(), ElementCount::getScalable(NumElts));
2368 Cnt->takeName(&II);
2369 return IC.replaceInstUsesWith(II, Cnt);
2370 }
2371
// Zero here means the pattern has no fixed element count (the result is used
// as a truth value in the condition below).
2372 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2373
2374 return MinNumElts && NumElts >= MinNumElts
2375 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2376 II, ConstantInt::get(II.getType(), MinNumElts)))
2377 : std::nullopt;
2378}
2379
// Rewrites the SME streaming-doubleword count in terms of the SVE count when
// the subtarget reports streaming mode; otherwise returns std::nullopt.
//
// NOTE(review): listing line 2381 (function name and leading parameters) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2380static std::optional<Instruction *>
2382 const AArch64Subtarget *ST) {
2383 if (!ST->isStreaming())
2384 return std::nullopt;
2385
2386 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2387 // with SVEPredPattern::all
// NOTE(review): listing line 2389 (the initialiser of `Cnt`) is missing; per
// the comment above it presumably builds the equivalent cntd value — confirm
// upstream.
2388 Value *Cnt =
2390 Cnt->takeName(&II);
2391 return IC.replaceInstUsesWith(II, Cnt);
2392}
2393
// Canonicalises ptest intrinsics (operand 0 = governing predicate, operand 1 =
// tested predicate). Three rewrites are attempted in order; if none applies
// std::nullopt is returned.
2394static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2395 IntrinsicInst &II) {
2396 Value *PgVal = II.getArgOperand(0);
2397 Value *OpVal = II.getArgOperand(1);
2398
2399 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2400 // Later optimizations prefer this form.
2401 if (PgVal == OpVal &&
2402 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2403 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2404 Value *Ops[] = {PgVal, OpVal};
2405 Type *Tys[] = {PgVal->getType()};
2406
2407 auto *PTest =
2408 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2409 PTest->takeName(&II);
2410
2411 return IC.replaceInstUsesWith(II, PTest);
2412 }
2413
// NOTE(review): listing lines 2414-2415 are missing; they presumably declare
// `Pg` and `Op` as IntrinsicInst views of PgVal and OpVal (both are
// null-checked and queried with getIntrinsicID() below) — confirm upstream.
2416
2417 if (!Pg || !Op)
2418 return std::nullopt;
2419
2420 Intrinsic::ID OpIID = Op->getIntrinsicID();
2421
// When both operands are to_svbool conversions from the same source type, run
// the same ptest directly on the unconverted predicates.
2422 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2423 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2424 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2425 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2426 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2427
2428 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2429
2430 PTest->takeName(&II);
2431 return IC.replaceInstUsesWith(II, PTest);
2432 }
2433
2434 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2435 // Later optimizations may rewrite sequence to use the flag-setting variant
2436 // of instruction X to remove PTEST.
2437 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2438 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2439 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2440 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2441 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2442 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2443 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2444 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2445 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2446 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2447 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2448 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2449 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
// The new governing predicate is operand 0 of the producing instruction.
2450 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2451 Type *Tys[] = {Pg->getType()};
2452
2453 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2454 PTest->takeName(&II);
2455
2456 return IC.replaceInstUsesWith(II, PTest);
2457 }
2458
2459 return std::nullopt;
2460}
2461
// Fuses a predicated add/sub-style intrinsic `II` with a single-use multiply
// intrinsic (MulOpc) feeding one of its operands into one FuseOpc call.
//
// MergeIntoAddendOp selects the operand layout: when true, operand 1 of II is
// the addend and operand 2 is the multiply, and the fused call is
// FuseOpc(P, Addend, MulOp0, MulOp1); when false the roles are swapped and the
// fused call is FuseOpc(P, MulOp0, MulOp1, Addend).
// Returns std::nullopt when the pattern or the fast-math conditions fail.
//
// NOTE(review): listing line 2464 (function name and leading parameters) is
// missing from this copy. In-view call sites use
// `instCombineSVEVectorFuseMulAddSub<...>(IC, II, <bool>)` — confirm upstream.
2462template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2463static std::optional<Instruction *>
2465 bool MergeIntoAddendOp) {
2466 Value *P = II.getOperand(0);
2467 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2468 if (MergeIntoAddendOp) {
2469 AddendOp = II.getOperand(1);
2470 Mul = II.getOperand(2);
2471 } else {
2472 AddendOp = II.getOperand(2);
2473 Mul = II.getOperand(1);
2474 }
2475
// NOTE(review): listing line 2476 is missing; it presumably opens an
// `if (!match(Mul, ...` that binds MulOp0 and MulOp1 (the final m_Value
// argument survives below) — confirm upstream.
2477 m_Value(MulOp1))))
2478 return std::nullopt;
2479
// The multiply must have no other users.
2480 if (!Mul->hasOneUse())
2481 return std::nullopt;
2482
2483 Instruction *FMFSource = nullptr;
2484 if (II.getType()->isFPOrFPVectorTy()) {
2485 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2486 // Stop the combine when the flags on the inputs differ in case dropping
2487 // flags would lead to us missing out on more beneficial optimizations.
2488 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2489 return std::nullopt;
// Floating-point fusion additionally requires the `contract` flag.
2490 if (!FAddFlags.allowContract())
2491 return std::nullopt;
2492 FMFSource = &II;
2493 }
2494
2495 Value *Res;
2496 if (MergeIntoAddendOp)
2497 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2498 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2499 else
2500 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2501 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2502
2503 return IC.replaceInstUsesWith(II, Res);
2504}
2505
// Replaces a predicated SVE load (operand 0 = predicate, operand 1 = pointer)
// with a plain load when the predicate is all-active, otherwise with a masked
// load whose pass-through is zero. Metadata is copied from II in both cases.
//
// NOTE(review): listing line 2507 (function name and parameter list) is
// missing from this copy. The body uses `IC`, `II` and `DL`, so all three are
// presumably parameters — confirm upstream.
2506static std::optional<Instruction *>
2508 Value *Pred = II.getOperand(0);
2509 Value *PtrOp = II.getOperand(1);
2510 Type *VecTy = II.getType();
2511
2512 if (isAllActivePredicate(Pred)) {
2513 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2514 Load->copyMetadata(II);
2515 return IC.replaceInstUsesWith(II, Load);
2516 }
2517
// Alignment is whatever can be derived from the pointer operand itself.
2518 CallInst *MaskedLoad =
2519 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2520 Pred, ConstantAggregateZero::get(VecTy));
2521 MaskedLoad->copyMetadata(II);
2522 return IC.replaceInstUsesWith(II, MaskedLoad);
2523}
2524
// Replaces a predicated SVE store (operand 0 = value, operand 1 = predicate,
// operand 2 = pointer) with a plain store when the predicate is all-active,
// otherwise with a masked store. Metadata is copied from II and II is erased
// in both cases.
//
// NOTE(review): listing line 2526 (function name and parameter list) is
// missing from this copy. The body uses `IC`, `II` and `DL`, so all three are
// presumably parameters — confirm upstream.
2525static std::optional<Instruction *>
2527 Value *VecOp = II.getOperand(0);
2528 Value *Pred = II.getOperand(1);
2529 Value *PtrOp = II.getOperand(2);
2530
2531 if (isAllActivePredicate(Pred)) {
2532 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2533 Store->copyMetadata(II);
2534 return IC.eraseInstFromFunction(II);
2535 }
2536
// Alignment is whatever can be derived from the pointer operand itself.
2537 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2538 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2539 MaskedStore->copyMetadata(II);
2540 return IC.eraseInstFromFunction(II);
2541}
2542
// Maps the "undefined inactive lanes" floating-point SVE intrinsics
// fmul_u / fadd_u / fsub_u to the corresponding IR binary opcode. Any other
// ID maps to Instruction::BinaryOpsEnd, which callers treat as "no mapping".
//
// NOTE(review): listing line 2543 (the function signature) is missing from
// this copy. The in-view call is `intrinsicIDToBinOpCode(II.getIntrinsicID())`
// and the switch below is on a parameter named `Intrinsic` — confirm the exact
// declaration upstream.
2544 switch (Intrinsic) {
2545 case Intrinsic::aarch64_sve_fmul_u:
2546 return Instruction::BinaryOps::FMul;
2547 case Intrinsic::aarch64_sve_fadd_u:
2548 return Instruction::BinaryOps::FAdd;
2549 case Intrinsic::aarch64_sve_fsub_u:
2550 return Instruction::BinaryOps::FSub;
2551 default:
2552 return Instruction::BinaryOpsEnd;
2553 }
2554}
2555
// Replaces an all-active fmul_u / fadd_u / fsub_u intrinsic with the matching
// IR binary operator on operands 1 and 2, carrying over II's fast-math flags.
// Returns std::nullopt for strict-FP calls, unmapped intrinsics, or predicates
// not known to be all-active.
//
// NOTE(review): listing line 2557 (function name and parameter list) is
// missing from this copy; in-view call sites use
// `instCombineSVEVectorBinOp(IC, II)` — confirm upstream.
2556static std::optional<Instruction *>
2558 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2559 if (II.isStrictFP())
2560 return std::nullopt;
2561
2562 auto *OpPredicate = II.getOperand(0);
2563 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2564 if (BinOpCode == Instruction::BinaryOpsEnd ||
2565 !isAllActivePredicate(OpPredicate))
2566 return std::nullopt;
2567 auto BinOp = IC.Builder.CreateBinOpFMF(
2568 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2569 return IC.replaceInstUsesWith(II, BinOp);
2570}
2571
// Simplifies aarch64_sve_mla_u (operand 1 = accumulator, operands 2 and 3 =
// multiplicands): a multiplicand of one turns the call into an add, a
// multiplicand of all-ones (-1) turns it into a sub, and a constant in the
// first multiplicand slot is moved to the second. Otherwise std::nullopt.
//
// NOTE(review): listing line 2573 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2572static std::optional<Instruction *>
2574 assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_mla_u &&
2575 "Expected MLA_U intrinsic");
2576 Value *Acc = II.getArgOperand(1);
2577 Value *MulOp0 = II.getArgOperand(2);
2578 Value *MulOp1 = II.getArgOperand(3);
2579
2580 // For mla_u, inactive lanes are undefined, so it is valid to drop the
2581 // predicate when replacing mla_u(acc, x, 1) with add(acc, x) or
2582 // mla_u(acc, x, -1) with sub(acc, x).
2583 if (match(MulOp0, m_One()))
2584 return IC.replaceInstUsesWith(II, IC.Builder.CreateAdd(Acc, MulOp1));
2585 if (match(MulOp1, m_One()))
2586 return IC.replaceInstUsesWith(II, IC.Builder.CreateAdd(Acc, MulOp0));
2587 if (match(MulOp0, m_AllOnes()))
2588 return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Acc, MulOp1));
2589 if (match(MulOp1, m_AllOnes()))
2590 return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Acc, MulOp0));
2591
// Canonicalise a lone constant multiplicand into the last operand slot.
2592 if (isa<Constant>(MulOp0) && !isa<Constant>(MulOp1)) {
2593 II.setArgOperand(2, MulOp1);
2594 II.setArgOperand(3, MulOp0);
2595 return &II;
2596 }
2597
2598 return std::nullopt;
2599}
2600
// Folds a following add into a sadalp / uadalp whose accumulator operand is
// zero, by rebuilding the intrinsic with the add's other operand as the
// accumulator and redirecting the add's uses to it.
// Returns std::nullopt unless II has a unique undroppable user that is such
// an add and II's operand 1 is zero.
//
// NOTE(review): listing line 2602 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2601static std::optional<Instruction *>
2603 assert((II.getIntrinsicID() == Intrinsic::aarch64_sve_sadalp ||
2604 II.getIntrinsicID() == Intrinsic::aarch64_sve_uadalp) &&
2605 "Expected SADALP or UADALP intrinsic");
2606
2607 // Simplify add(adalp(pg, zeroinitializer, in), wide_acc)
2608 // -> adalp(pg, wide_acc, in)
2609 auto *User = dyn_cast_or_null<Instruction>(II.getUniqueUndroppableUser());
2610 if (!User || !match(II.getArgOperand(1), m_Zero()))
2611 return std::nullopt;
2612
2613 Value *Acc;
2614 if (!match(User, m_c_Add(m_Specific(&II), m_Value(Acc))))
2615 return std::nullopt;
2616
// NOTE(review): listing line 2617 is missing here; a statement preceding the
// builder call below was dropped — confirm upstream.
2618 Value *PairwiseAddLong = IC.Builder.CreateIntrinsic(
2619 II.getIntrinsicID(), {II.getType()},
2620 {II.getArgOperand(0), Acc, II.getArgOperand(2)});
2621
2622 IC.replaceInstUsesWith(*User, PairwiseAddLong);
// NOTE(review): listing line 2623 is missing here; a statement between the
// use replacement and the return was dropped — confirm upstream.
2624 return &II; // II is now trivially dead and will get erased.
2625}
2626
2627static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2628 IntrinsicInst &II) {
2629 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2630 Intrinsic::aarch64_sve_mla>(
2631 IC, II, true))
2632 return MLA;
2633 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2634 Intrinsic::aarch64_sve_mad>(
2635 IC, II, false))
2636 return MAD;
2637 return std::nullopt;
2638}
2639
// Tries, in order, to fuse a floating-point multiply feeding this add:
//   fmul   in operand 2 -> fmla
//   fmul   in operand 1 -> fmad
//   fmul_u in operand 2 -> fmla
// The first successful fusion is returned; otherwise std::nullopt.
//
// NOTE(review): listing line 2641 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2640static std::optional<Instruction *>
2642 if (auto FMLA =
2643 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2644 Intrinsic::aarch64_sve_fmla>(IC, II,
2645 true))
2646 return FMLA;
2647 if (auto FMAD =
2648 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2649 Intrinsic::aarch64_sve_fmad>(IC, II,
2650 false))
2651 return FMAD;
2652 if (auto FMLA =
2653 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2654 Intrinsic::aarch64_sve_fmla>(IC, II,
2655 true))
2656 return FMLA;
2657 return std::nullopt;
2658}
2659
// Tries, in order, to fuse a floating-point multiply feeding this add:
//   fmul   in operand 2 -> fmla
//   fmul   in operand 1 -> fmad
//   fmul_u in operand 2 -> fmla_u
// If none applies, falls back to instCombineSVEVectorBinOp.
//
// NOTE(review): listing line 2661 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2660static std::optional<Instruction *>
2662 if (auto FMLA =
2663 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2664 Intrinsic::aarch64_sve_fmla>(IC, II,
2665 true))
2666 return FMLA;
2667 if (auto FMAD =
2668 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2669 Intrinsic::aarch64_sve_fmad>(IC, II,
2670 false))
2671 return FMAD;
2672 if (auto FMLA_U =
2673 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2674 Intrinsic::aarch64_sve_fmla_u>(
2675 IC, II, true))
2676 return FMLA_U;
2677 return instCombineSVEVectorBinOp(IC, II);
2678}
2679
// Tries, in order, to fuse a floating-point multiply feeding this subtract:
//   fmul   in operand 2 -> fmls
//   fmul   in operand 1 -> fnmsb
//   fmul_u in operand 2 -> fmls
// The first successful fusion is returned; otherwise std::nullopt.
//
// NOTE(review): listing line 2681 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2680static std::optional<Instruction *>
2682 if (auto FMLS =
2683 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2684 Intrinsic::aarch64_sve_fmls>(IC, II,
2685 true))
2686 return FMLS;
2687 if (auto FMSB =
2688 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2689 Intrinsic::aarch64_sve_fnmsb>(
2690 IC, II, false))
2691 return FMSB;
2692 if (auto FMLS =
2693 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2694 Intrinsic::aarch64_sve_fmls>(IC, II,
2695 true))
2696 return FMLS;
2697 return std::nullopt;
2698}
2699
// Tries, in order, to fuse a floating-point multiply feeding this subtract:
//   fmul   in operand 2 -> fmls
//   fmul   in operand 1 -> fnmsb
//   fmul_u in operand 2 -> fmls_u
// If none applies, falls back to instCombineSVEVectorBinOp.
//
// NOTE(review): listing line 2701 (function name and parameter list) is
// missing from this copy; the body uses `IC` and `II` — confirm upstream.
2700static std::optional<Instruction *>
2702 if (auto FMLS =
2703 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2704 Intrinsic::aarch64_sve_fmls>(IC, II,
2705 true))
2706 return FMLS;
2707 if (auto FMSB =
2708 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2709 Intrinsic::aarch64_sve_fnmsb>(
2710 IC, II, false))
2711 return FMSB;
2712 if (auto FMLS_U =
2713 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2714 Intrinsic::aarch64_sve_fmls_u>(
2715 IC, II, true))
2716 return FMLS_U;
2717 return instCombineSVEVectorBinOp(IC, II);
2718}
2719
2720static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2721 IntrinsicInst &II) {
2722 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2723 Intrinsic::aarch64_sve_mls>(
2724 IC, II, true))
2725 return MLS;
2726 return std::nullopt;
2727}
2728
2729static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2730 IntrinsicInst &II) {
2731 Value *UnpackArg = II.getArgOperand(0);
2732 auto *RetTy = cast<ScalableVectorType>(II.getType());
2733 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2734 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2735
2736 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2737 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2738 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2739 ScalarArg =
2740 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2741 Value *NewVal =
2742 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2743 NewVal->takeName(&II);
2744 return IC.replaceInstUsesWith(II, NewVal);
2745 }
2746
2747 return std::nullopt;
2748}
2749static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2750 IntrinsicInst &II) {
2751 auto *OpVal = II.getOperand(0);
2752 auto *OpIndices = II.getOperand(1);
2753 VectorType *VTy = cast<VectorType>(II.getType());
2754
2755 // Check whether OpIndices is a constant splat value < minimal element count
2756 // of result.
2757 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2758 if (!SplatValue ||
2759 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2760 return std::nullopt;
2761
2762 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2763 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2764 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2765 auto *VectorSplat =
2766 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2767
2768 VectorSplat->takeName(&II);
2769 return IC.replaceInstUsesWith(II, VectorSplat);
2770}
2771
// Combine for aarch64_sve_uzp1: when both operands are svbool conversions of
// values A and B (see the two patterns listed below), the result is rebuilt
// by inserting A at element 0 and B at element TyA->getMinNumElements() of a
// poison vector of the result type.
// NOTE(review): this listing skips three source lines (the from_svbool
// matchers after each of the first two match( calls, and the second half of
// the inner `if` condition) -- restore them from the upstream file before
// relying on this text.
2772static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2773 IntrinsicInst &II) {
2774 Value *A, *B;
2775 Type *RetTy = II.getType();
2776 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2777 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2778
2779 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2780 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2781 if ((match(II.getArgOperand(0),
2783 match(II.getArgOperand(1),
2785 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2786 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2787 auto *TyA = cast<ScalableVectorType>(A->getType());
// A and B must have the same type for the concatenation below.
2788 if (TyA == B->getType() &&
2790 auto *SubVec = IC.Builder.CreateInsertVector(
2791 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2792 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2793 TyA->getMinNumElements());
2794 ConcatVec->takeName(&II);
2795 return IC.replaceInstUsesWith(II, ConcatVec);
2796 }
2797 }
2798
2799 return std::nullopt;
2800}
2801
2802static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2803 IntrinsicInst &II) {
2804 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2805 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2806 Value *A, *B;
2807 if (match(II.getArgOperand(0),
2810 m_Specific(A), m_Specific(B))))
2811 return IC.replaceInstUsesWith(
2812 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2813
2814 return std::nullopt;
2815}
2816
2817static std::optional<Instruction *>
2819 Value *Mask = II.getOperand(0);
2820 Value *BasePtr = II.getOperand(1);
2821 Value *Index = II.getOperand(2);
2822 Type *Ty = II.getType();
2823 Value *PassThru = ConstantAggregateZero::get(Ty);
2824
2825 // Contiguous gather => masked load.
2826 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2827 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2828 Value *IndexBase;
2830 m_One()))) {
2831 Align Alignment =
2832 BasePtr->getPointerAlignment(II.getDataLayout());
2833
2834 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2835 BasePtr, IndexBase);
2836 CallInst *MaskedLoad =
2837 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2838 MaskedLoad->takeName(&II);
2839 return IC.replaceInstUsesWith(II, MaskedLoad);
2840 }
2841
2842 return std::nullopt;
2843}
2844
2845static std::optional<Instruction *>
2847 Value *Val = II.getOperand(0);
2848 Value *Mask = II.getOperand(1);
2849 Value *BasePtr = II.getOperand(2);
2850 Value *Index = II.getOperand(3);
2851 Type *Ty = Val->getType();
2852
2853 // Contiguous scatter => masked store.
2854 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2855 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2856 Value *IndexBase;
2858 m_One()))) {
2859 Align Alignment =
2860 BasePtr->getPointerAlignment(II.getDataLayout());
2861
2862 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2863 BasePtr, IndexBase);
2864 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2865
2866 return IC.eraseInstFromFunction(II);
2867 }
2868
2869 return std::nullopt;
2870}
2871
2872static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2873 IntrinsicInst &II) {
2874 Type *Int32Ty = IC.Builder.getInt32Ty();
2875 Value *Pred = II.getOperand(0);
2876 Value *Vec = II.getOperand(1);
2877 Value *DivVec = II.getOperand(2);
2878
2879 Value *SplatValue = getSplatValue(DivVec);
2880 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2881 if (!SplatConstantInt)
2882 return std::nullopt;
2883
2884 APInt Divisor = SplatConstantInt->getValue();
2885 const int64_t DivisorValue = Divisor.getSExtValue();
2886 if (DivisorValue == -1)
2887 return std::nullopt;
2888 if (DivisorValue == 1)
2889 IC.replaceInstUsesWith(II, Vec);
2890
2891 if (Divisor.isPowerOf2()) {
2892 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2893 auto ASRD = IC.Builder.CreateIntrinsic(
2894 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2895 return IC.replaceInstUsesWith(II, ASRD);
2896 }
2897 if (Divisor.isNegatedPowerOf2()) {
2898 Divisor.negate();
2899 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2900 auto ASRD = IC.Builder.CreateIntrinsic(
2901 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2902 auto NEG = IC.Builder.CreateIntrinsic(
2903 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2904 return IC.replaceInstUsesWith(II, NEG);
2905 }
2906
2907 return std::nullopt;
2908}
2909
2910bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2911 size_t VecSize = Vec.size();
2912 if (VecSize == 1)
2913 return true;
2914 if (!isPowerOf2_64(VecSize))
2915 return false;
2916 size_t HalfVecSize = VecSize / 2;
2917
2918 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2919 RHS != Vec.end(); LHS++, RHS++) {
2920 if (*LHS != nullptr && *RHS != nullptr) {
2921 if (*LHS == *RHS)
2922 continue;
2923 else
2924 return false;
2925 }
2926 if (!AllowPoison)
2927 return false;
2928 if (*LHS == nullptr && *RHS != nullptr)
2929 *LHS = *RHS;
2930 }
2931
2932 Vec.resize(HalfVecSize);
2933 SimplifyValuePattern(Vec, AllowPoison);
2934 return true;
2935}
2936
2937// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2938// to dupqlane(f64(C)) where C is A concatenated with B
// Steps visible below: collect the scalars of the insertelement chain feeding
// operand 0, shrink them to the shortest repeating prefix with
// SimplifyValuePattern, rebuild that prefix as a short insertelement chain,
// then splat it by bitcasting to a vector of wider integers, shuffling with
// an all-zero mask, and bitcasting back to the original type.
// NOTE(review): this listing skips the source line that opens the matcher
// inside the first `if` -- restore it from the upstream file.
2939static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2940 IntrinsicInst &II) {
2941 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2942 if (!match(II.getOperand(0),
2944 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2945 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2946 return std::nullopt;
2947 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2948
2949 // Insert the scalars into a container ordered by InsertElement index
// Entries never written by the chain stay nullptr.
// NOTE(review): the index operand is cast<ConstantInt> unchecked and used to
// subscript Elts without a bounds check -- presumably callers only build
// constant, in-range chains; confirm.
2950 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2951 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2952 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2953 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2954 CurrentInsertElt = InsertElt->getOperand(0);
2955 }
2956
// Unset lanes may be treated as wildcards only when both the chain's base
// vector and Default are poison.
2957 bool AllowPoison =
2958 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2959 if (!SimplifyValuePattern(Elts, AllowPoison))
2960 return std::nullopt;
2961
2962 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2963 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2964 for (size_t I = 0; I < Elts.size(); I++) {
2965 if (Elts[I] == nullptr)
2966 continue;
2967 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2968 IC.Builder.getInt64(I));
2969 }
// NOTE(review): InsertEltChain starts as a PoisonValue and is only reassigned
// from CreateInsertElement, so this null check looks unreachable -- confirm.
2970 if (InsertEltChain == nullptr)
2971 return std::nullopt;
2972
2973 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2974 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2975 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2976 // be narrowed back to the original type.
// PatternWidth is the bit width of the repeating prefix; PatternElementCount
// is how many such prefixes fit in the minimum-size result vector.
2977 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2978 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2979 IIScalableTy->getMinNumElements() /
2980 PatternWidth;
2981
2982 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2983 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2984 auto *WideShuffleMaskTy =
2985 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2986
2987 auto InsertSubvector = IC.Builder.CreateInsertVector(
2988 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2989 uint64_t(0));
2990 auto WideBitcast =
2991 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
// An all-zero shuffle mask selects element 0 for every lane.
2992 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2993 auto WideShuffle = IC.Builder.CreateShuffleVector(
2994 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2995 auto NarrowBitcast =
2996 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2997
2998 return IC.replaceInstUsesWith(II, NarrowBitcast);
2999}
3000
3001static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
3002 IntrinsicInst &II) {
3003 Value *A = II.getArgOperand(0);
3004 Value *B = II.getArgOperand(1);
3005 if (A == B)
3006 return IC.replaceInstUsesWith(II, A);
3007
3008 return std::nullopt;
3009}
3010
// Combine for aarch64_sve_srshl with operands (Pred, Vec, Shift): when Vec
// comes from a matched ABS-style intrinsic and the checks below pass, the
// call is replaced by aarch64_sve_lsl on the same three operands.
// NOTE(review): this listing skips the two source lines that open the
// matchers on Vec -- restore them from the upstream file.
3011static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
3012 IntrinsicInst &II) {
3013 Value *Pred = II.getOperand(0);
3014 Value *Vec = II.getOperand(1);
3015 Value *Shift = II.getOperand(2);
3016
3017 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
3018 Value *AbsPred, *MergedValue;
3020 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
3022 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
3023
3024 return std::nullopt;
3025
3026 // Transform is valid if any of the following are true:
3027 // * The ABS merge value is an undef or non-negative
3028 // * The ABS predicate is all active
3029 // * The ABS predicate and the SRSHL predicates are the same
// The condition below is the negation of that list: bail out only when none
// of the alternatives holds.
3030 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
3031 AbsPred != Pred && !isAllActivePredicate(AbsPred))
3032 return std::nullopt;
3033
3034 // Only valid when the shift amount is non-negative, otherwise the rounding
3035 // behaviour of SRSHL cannot be ignored.
3036 if (!match(Shift, m_NonNegative()))
3037 return std::nullopt;
3038
3039 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
3040 {II.getType()}, {Pred, Vec, Shift});
3041
3042 return IC.replaceInstUsesWith(II, LSL);
3043}
3044
3045static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
3046 IntrinsicInst &II) {
3047 Value *Vec = II.getOperand(0);
3048
3049 if (getSplatValue(Vec) == II.getOperand(1))
3050 return IC.replaceInstUsesWith(II, Vec);
3051
3052 return std::nullopt;
3053}
3054
3055static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
3056 IntrinsicInst &II) {
3057 // If this barrier is post-dominated by identical one we can remove it
3058 auto *NI = II.getNextNode();
3059 unsigned LookaheadThreshold = DMBLookaheadThreshold;
3060 auto CanSkipOver = [](Instruction *I) {
3061 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
3062 };
3063 while (LookaheadThreshold-- && CanSkipOver(NI)) {
3064 auto *NIBB = NI->getParent();
3065 NI = NI->getNextNode();
3066 if (!NI) {
3067 if (auto *SuccBB = NIBB->getUniqueSuccessor())
3068 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
3069 else
3070 break;
3071 }
3072 }
3073 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
3074 if (NextII && II.isIdenticalTo(NextII))
3075 return IC.eraseInstFromFunction(II);
3076
3077 return std::nullopt;
3078}
3079
3080static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
3081 IntrinsicInst &II) {
3082 return IC.replaceInstUsesWith(
3083 II,
3084 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
3085 {II.getType(), II.getOperand(0)->getType()},
3086 {II.getOperand(0), II.getOperand(1)}));
3087}
3088
3089static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
3090 IntrinsicInst &II) {
3091 unsigned PredPattern = cast<ConstantInt>(II.getOperand(0))->getZExtValue();
3092 // SVE vector length is a power-of-two, thus pow2 is synonymous with all.
3093 if (PredPattern == AArch64SVEPredPattern::all ||
3094 PredPattern == AArch64SVEPredPattern::pow2)
3095 return IC.replaceInstUsesWith(II, ConstantInt::getTrue(II.getType()));
3096 return std::nullopt;
3097}
3098
3099static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
3101 unsigned NumBits) {
3102 Value *Passthru = II.getOperand(0);
3103 Value *Pg = II.getOperand(1);
3104 Value *Op = II.getOperand(2);
3105
3106 // Convert UXT[BHW] to AND.
3107 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
3108 auto *Ty = cast<VectorType>(II.getType());
3109 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
3110 auto *Mask = ConstantInt::get(Ty, MaskValue);
3111 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
3112 {Pg, Op, Mask});
3113 return IC.replaceInstUsesWith(II, And);
3114 }
3115
3116 return std::nullopt;
3117}
3118
3119static std::optional<Instruction *>
3121 SMEAttrs FnSMEAttrs(*II.getFunction());
3122 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
3123 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
3124 return IC.replaceInstUsesWith(
3125 II, ConstantInt::getBool(II.getType(), IsStreaming));
3126 return std::nullopt;
3127}
3128
// Target hook that dispatches AArch64 intrinsic calls to the per-intrinsic
// combines in this file. simplifySVEIntrinsic is tried first; otherwise the
// switch on the intrinsic ID selects a combine, and std::nullopt is returned
// for IDs that have no case.
// NOTE(review): this listing skips three source lines -- the line carrying
// the function name and first parameter, the definition of IInfo, and the
// return statement for the sadalp/uadalp cases. Restore them from the
// upstream file before relying on this text.
3129std::optional<Instruction *>
3131 IntrinsicInst &II) const {
3133 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3134 return I;
3135
3136 Intrinsic::ID IID = II.getIntrinsicID();
3137 switch (IID) {
3138 default:
3139 break;
3140 case Intrinsic::aarch64_dmb:
3141 return instCombineDMB(IC, II);
3142 case Intrinsic::aarch64_neon_fmaxnm:
3143 case Intrinsic::aarch64_neon_fminnm:
3144 return instCombineMaxMinNM(IC, II);
3145 case Intrinsic::aarch64_sve_convert_from_svbool:
3146 return instCombineConvertFromSVBool(IC, II);
3147 case Intrinsic::aarch64_sve_dup:
3148 return instCombineSVEDup(IC, II);
3149 case Intrinsic::aarch64_sve_dup_x:
3150 return instCombineSVEDupX(IC, II);
3151 case Intrinsic::aarch64_sve_cmpne:
3152 case Intrinsic::aarch64_sve_cmpne_wide:
3153 return instCombineSVECmpNE(IC, II);
3154 case Intrinsic::aarch64_sve_rdffr:
3155 return instCombineRDFFR(IC, II);
3156 case Intrinsic::aarch64_sve_lasta:
3157 case Intrinsic::aarch64_sve_lastb:
3158 return instCombineSVELast(IC, II);
3159 case Intrinsic::aarch64_sve_clasta_n:
3160 case Intrinsic::aarch64_sve_clastb_n:
3161 return instCombineSVECondLast(IC, II);
// The cnt* cases pass 2/4/8/16 as the third argument.
// NOTE(review): presumably elements per 128-bit granule -- confirm against
// instCombineSVECntElts.
3162 case Intrinsic::aarch64_sve_cntd:
3163 return instCombineSVECntElts(IC, II, 2);
3164 case Intrinsic::aarch64_sve_cntw:
3165 return instCombineSVECntElts(IC, II, 4);
3166 case Intrinsic::aarch64_sve_cnth:
3167 return instCombineSVECntElts(IC, II, 8);
3168 case Intrinsic::aarch64_sve_cntb:
3169 return instCombineSVECntElts(IC, II, 16);
3170 case Intrinsic::aarch64_sme_cntsd:
3171 return instCombineSMECntsd(IC, II, ST);
3172 case Intrinsic::aarch64_sve_ptest_any:
3173 case Intrinsic::aarch64_sve_ptest_first:
3174 case Intrinsic::aarch64_sve_ptest_last:
3175 return instCombineSVEPTest(IC, II);
3176 case Intrinsic::aarch64_sve_fadd:
3177 return instCombineSVEVectorFAdd(IC, II);
3178 case Intrinsic::aarch64_sve_fadd_u:
3179 return instCombineSVEVectorFAddU(IC, II);
3180 case Intrinsic::aarch64_sve_fmul_u:
3181 return instCombineSVEVectorBinOp(IC, II);
3182 case Intrinsic::aarch64_sve_fsub:
3183 return instCombineSVEVectorFSub(IC, II);
3184 case Intrinsic::aarch64_sve_fsub_u:
3185 return instCombineSVEVectorFSubU(IC, II);
3186 case Intrinsic::aarch64_sve_add:
3187 return instCombineSVEVectorAdd(IC, II);
3188 case Intrinsic::aarch64_sve_add_u:
3189 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3190 Intrinsic::aarch64_sve_mla_u>(
3191 IC, II, true);
3192 case Intrinsic::aarch64_sve_mla_u:
3193 return instCombineSVEVectorMlaU(IC, II);
// NOTE(review): the return for sadalp/uadalp is missing from this listing; as
// shown here the two cases would fall through into the aarch64_sve_sub case.
3194 case Intrinsic::aarch64_sve_sadalp:
3195 case Intrinsic::aarch64_sve_uadalp:
3197 case Intrinsic::aarch64_sve_sub:
3198 return instCombineSVEVectorSub(IC, II);
3199 case Intrinsic::aarch64_sve_sub_u:
3200 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3201 Intrinsic::aarch64_sve_mls_u>(
3202 IC, II, true);
3203 case Intrinsic::aarch64_sve_tbl:
3204 return instCombineSVETBL(IC, II);
3205 case Intrinsic::aarch64_sve_uunpkhi:
3206 case Intrinsic::aarch64_sve_uunpklo:
3207 case Intrinsic::aarch64_sve_sunpkhi:
3208 case Intrinsic::aarch64_sve_sunpklo:
3209 return instCombineSVEUnpack(IC, II);
3210 case Intrinsic::aarch64_sve_uzp1:
3211 return instCombineSVEUzp1(IC, II);
3212 case Intrinsic::aarch64_sve_zip1:
3213 case Intrinsic::aarch64_sve_zip2:
3214 return instCombineSVEZip(IC, II);
3215 case Intrinsic::aarch64_sve_ld1_gather_index:
3216 return instCombineLD1GatherIndex(IC, II);
3217 case Intrinsic::aarch64_sve_st1_scatter_index:
3218 return instCombineST1ScatterIndex(IC, II);
3219 case Intrinsic::aarch64_sve_ld1:
3220 return instCombineSVELD1(IC, II, DL);
3221 case Intrinsic::aarch64_sve_st1:
3222 return instCombineSVEST1(IC, II, DL);
3223 case Intrinsic::aarch64_sve_sdiv:
3224 return instCombineSVESDIV(IC, II);
3225 case Intrinsic::aarch64_sve_sel:
3226 return instCombineSVESel(IC, II);
3227 case Intrinsic::aarch64_sve_srshl:
3228 return instCombineSVESrshl(IC, II);
3229 case Intrinsic::aarch64_sve_dupq_lane:
3230 return instCombineSVEDupqLane(IC, II);
3231 case Intrinsic::aarch64_sve_insr:
3232 return instCombineSVEInsr(IC, II);
3233 case Intrinsic::aarch64_sve_whilelo:
3234 return instCombineWhilelo(IC, II);
3235 case Intrinsic::aarch64_sve_ptrue:
3236 return instCombinePTrue(IC, II);
// The uxt* cases pass the number of low bits to keep: 8, 16 or 32.
3237 case Intrinsic::aarch64_sve_uxtb:
3238 return instCombineSVEUxt(IC, II, 8);
3239 case Intrinsic::aarch64_sve_uxth:
3240 return instCombineSVEUxt(IC, II, 16);
3241 case Intrinsic::aarch64_sve_uxtw:
3242 return instCombineSVEUxt(IC, II, 32);
3243 case Intrinsic::aarch64_sme_in_streaming_mode:
3244 return instCombineInStreamingMode(IC, II);
3245 }
3246
3247 return std::nullopt;
3248}
3249
3251 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3252 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3253 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3254 SimplifyAndSetOp) const {
3255 switch (II.getIntrinsicID()) {
3256 default:
3257 break;
3258 case Intrinsic::aarch64_neon_fcvtxn:
3259 case Intrinsic::aarch64_neon_rshrn:
3260 case Intrinsic::aarch64_neon_sqrshrn:
3261 case Intrinsic::aarch64_neon_sqrshrun:
3262 case Intrinsic::aarch64_neon_sqshrn:
3263 case Intrinsic::aarch64_neon_sqshrun:
3264 case Intrinsic::aarch64_neon_sqxtn:
3265 case Intrinsic::aarch64_neon_sqxtun:
3266 case Intrinsic::aarch64_neon_uqrshrn:
3267 case Intrinsic::aarch64_neon_uqshrn:
3268 case Intrinsic::aarch64_neon_uqxtn:
3269 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3270 break;
3271 }
3272
3273 return std::nullopt;
3274}
3275
3277 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3279}
3280
3283 switch (K) {
3285 return TypeSize::getFixed(64);
3287 if (ST->useSVEForFixedLengthVectors() &&
3288 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3289 return TypeSize::getFixed(
3290 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3291 else if (ST->isNeonAvailable())
3292 return TypeSize::getFixed(128);
3293 else
3294 return TypeSize::getFixed(0);
3296 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3298 return TypeSize::getScalable(128);
3299 else
3300 return TypeSize::getScalable(0);
3301 }
3302 llvm_unreachable("Unsupported register kind");
3303}
3304
3305bool AArch64TTIImpl::isSingleExtWideningInstruction(
3306 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3307 Type *SrcOverrideTy) const {
3308 // A helper that returns a vector type from the given type. The number of
3309 // elements in type Ty determines the vector width.
3310 auto toVectorTy = [&](Type *ArgTy) {
3311 return VectorType::get(ArgTy->getScalarType(),
3312 cast<VectorType>(DstTy)->getElementCount());
3313 };
3314
3315 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3316 // i32, i64]. SVE doesn't generally have the same set of instructions to
3317 // perform an extend with the add/sub/mul. There are SMULLB style
3318 // instructions, but they operate on top/bottom, requiring some sort of lane
3319 // interleaving to be used with zext/sext.
3320 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3321 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3322 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3323 return false;
3324
3325 Type *SrcTy = SrcOverrideTy;
3326 switch (Opcode) {
3327 case Instruction::Add: // UADDW(2), SADDW(2).
3328 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3329 // The second operand needs to be an extend
3330 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3331 if (!SrcTy)
3332 SrcTy =
3333 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3334 break;
3335 }
3336
3337 if (Opcode == Instruction::Sub)
3338 return false;
3339
3340 // UADDW(2), SADDW(2) can be commutted.
3341 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3342 if (!SrcTy)
3343 SrcTy =
3344 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3345 break;
3346 }
3347 return false;
3348 }
3349 default:
3350 return false;
3351 }
3352
3353 // Legalize the destination type and ensure it can be used in a widening
3354 // operation.
3355 auto DstTyL = getTypeLegalizationCost(DstTy);
3356 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3357 return false;
3358
3359 // Legalize the source type and ensure it can be used in a widening
3360 // operation.
3361 assert(SrcTy && "Expected some SrcTy");
3362 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3363 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3364 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3365 return false;
3366
3367 // Get the total number of vector elements in the legalized types.
3368 InstructionCost NumDstEls =
3369 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3370 InstructionCost NumSrcEls =
3371 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3372
3373 // Return true if the legalized types have the same number of vector elements
3374 // and the destination element type size is twice that of the source type.
3375 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3376}
3377
3378Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3380 Type *SrcOverrideTy) const {
3381 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3382 Opcode != Instruction::Mul)
3383 return nullptr;
3384
3385 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3386 // i32, i64]. SVE doesn't generally have the same set of instructions to
3387 // perform an extend with the add/sub/mul. There are SMULLB style
3388 // instructions, but they operate on top/bottom, requiring some sort of lane
3389 // interleaving to be used with zext/sext.
3390 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3391 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3392 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3393 return nullptr;
3394
3395 auto getScalarSizeWithOverride = [&](const Value *V) {
3396 if (SrcOverrideTy)
3397 return SrcOverrideTy->getScalarSizeInBits();
3398 return cast<Instruction>(V)
3399 ->getOperand(0)
3400 ->getType()
3401 ->getScalarSizeInBits();
3402 };
3403
3404 unsigned MaxEltSize = 0;
3405 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3406 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3407 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3408 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3409 MaxEltSize = std::max(EltSize0, EltSize1);
3410 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3411 isa<SExtInst, ZExtInst>(Args[1])) {
3412 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3413 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3414 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3415 // enough.
3416 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3417 return nullptr;
3418 MaxEltSize = DstEltSize / 2;
3419 } else if (Opcode == Instruction::Mul &&
3420 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3421 // If one of the operands is a Zext and the other has enough zero bits
3422 // to be treated as unsigned, we can still generate a umull, meaning the
3423 // zext is free.
3424 KnownBits Known =
3425 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3426 if (Args[0]->getType()->getScalarSizeInBits() -
3427 Known.Zero.countLeadingOnes() >
3428 DstTy->getScalarSizeInBits() / 2)
3429 return nullptr;
3430
3431 MaxEltSize =
3432 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3433 } else
3434 return nullptr;
3435
3436 if (MaxEltSize * 2 > DstEltSize)
3437 return nullptr;
3438
3439 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3440 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3441 return nullptr;
3442 return ExtTy;
3443}
3444
3445// s/urhadd instructions implement the following pattern, making the
3446// extends free:
3447// %x = add ((zext i8 -> i16), 1)
3448// %y = (zext i8 -> i16)
3449// trunc i16 (lshr (add %x, %y), 1) -> i8
3450//
3452 Type *Src) const {
3453 // The source should be a legal vector type.
3454 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3455 (Src->isScalableTy() && !ST->hasSVE2()))
3456 return false;
3457
3458 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3459 return false;
3460
3461 // Look for trunc/shl/add before trying to match the pattern.
3462 const Instruction *Add = ExtUser;
3463 auto *AddUser =
3464 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3465 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3466 Add = AddUser;
3467
3468 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3469 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3470 return false;
3471
3472 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3473 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3474 Src->getScalarSizeInBits() !=
3475 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3476 return false;
3477
3478 // Try to match the whole pattern. Ext could be either the first or second
3479 // m_ZExtOrSExt matched.
3480 Instruction *Ex1, *Ex2;
3481 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3482 m_c_Add(m_Instruction(Ex2), m_One())))))
3483 return false;
3484
3485 // Ensure both extends are of the same type
3486 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3487 Ex1->getOpcode() == Ex2->getOpcode())
3488 return true;
3489
3490 return false;
3491}
3492
3494 Type *Src,
3497 const Instruction *I) const {
3498 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3499 assert(ISD && "Invalid opcode");
3500 // If the cast is observable, and it is used by a widening instruction (e.g.,
3501 // uaddl, saddw, etc.), it may be free.
3502 if (I && I->hasOneUser()) {
3503 auto *SingleUser = cast<Instruction>(*I->user_begin());
3504 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3505 if (Type *ExtTy = isBinExtWideningInstruction(
3506 SingleUser->getOpcode(), Dst, Operands,
3507 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3508 // The cost from Src->Src*2 needs to be added if required, the cost from
3509 // Src*2->ExtTy is free.
3510 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3511 Type *DoubleSrcTy =
3512 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3513 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3515 }
3516
3517 return 0;
3518 }
3519
3520 if (isSingleExtWideningInstruction(
3521 SingleUser->getOpcode(), Dst, Operands,
3522 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3523 // For adds only count the second operand as free if both operands are
3524 // extends but not the same operation. (i.e both operands are not free in
3525 // add(sext, zext)).
3526 if (SingleUser->getOpcode() == Instruction::Add) {
3527 if (I == SingleUser->getOperand(1) ||
3528 (isa<CastInst>(SingleUser->getOperand(1)) &&
3529 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3530 return 0;
3531 } else {
3532 // Others are free so long as isSingleExtWideningInstruction
3533 // returned true.
3534 return 0;
3535 }
3536 }
3537
3538 // The cast will be free for the s/urhadd instructions
3539 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3540 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3541 return 0;
3542 }
3543
3544 EVT SrcTy = TLI->getValueType(DL, Src);
3545 EVT DstTy = TLI->getValueType(DL, Dst);
3546
3547 if (!SrcTy.isSimple() || !DstTy.isSimple())
3548 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3549
3550 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3551 // we use fcvtx under SVE2. Give them invalid costs.
3552 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3553 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3554 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3556
3557 static const TypeConversionCostTblEntry BF16Tbl[] = {
3558 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3559 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3560 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3561 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3562 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3563 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3564 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3565 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3566 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3567 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3568 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3569 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3570 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3571 };
3572
3573 if (ST->hasBF16())
3574 if (const auto *Entry = ConvertCostTableLookup(
3575 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3576 return Entry->Cost;
3577
3578 // We have to estimate a cost of fixed length operation upon
3579 // SVE registers(operations) with the number of registers required
3580 // for a fixed type to be represented upon SVE registers.
3581 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3582 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3583 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3584 ST->useSVEForFixedLengthVectors(WiderTy)) {
3585 std::pair<InstructionCost, MVT> LT =
3586 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3587 unsigned NumElements =
3588 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3589 return LT.first *
3591 Opcode,
3592 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3593 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3594 CostKind, I);
3595 }
3596
3597 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3598 // The cost of unpacking twice is artificially increased for now in order
3599 // to avoid regressions against NEON, which will use tbl instructions directly
3600 // instead of multiple layers of [s|u]unpk[lo|hi].
3601 // We use the unpacks in cases where the destination type is illegal and
3602 // requires splitting of the input, even if the input type itself is legal.
3603 const unsigned int SVE_EXT_COST = 1;
3604 const unsigned int SVE_FCVT_COST = 1;
3605 const unsigned int SVE_UNPACK_ONCE = 4;
3606 const unsigned int SVE_UNPACK_TWICE = 16;
3607
3608 static const TypeConversionCostTblEntry ConversionTbl[] = {
3609 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3610 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3611 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3612 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3613 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3614 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3615 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3616 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3617 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3618 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3619 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3620 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3621 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3622 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3623 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3624 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3625 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3626 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3627 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3628 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3629
3630 // Truncations on nxvmiN
3631 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3632 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3633 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3634 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3635 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3636 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3637 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3638 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3639 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3640 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3641 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3642 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3643 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3644 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3645 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3646 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3647 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3648 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3649 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3650 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3651 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3652 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3653 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3654 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3655 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3656 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3657 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3658 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3659 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3660 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3661 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3662 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3663 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3664
3665 // The number of shll instructions for the extension.
3666 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3667 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3668 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3669 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3670 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3671 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3672 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3673 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3674 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3675 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3676 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3677 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3678 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3679 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3680 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3681 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3682
3683 // FP Ext and trunc
3684 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3685 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3686 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3687 // FP16
3688 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3689 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3690 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3691 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3692 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3693 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3694 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3695 // BF16 (uses shift)
3696 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3697 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3698 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3699 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3700 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3701 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3702 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3703 // FP Ext and trunc
3704 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3705 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3706 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3707 // FP16
3708 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3709 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3710 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3711 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3712 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3713 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3714 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3715 // BF16 (more complex, with +bf16 is handled above)
3716 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3717 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3718 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3719 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3720 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3721 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3722 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3723 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3724
3725 // LowerVectorINT_TO_FP:
3726 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3727 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3728 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3729 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3730 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3731 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3732
3733 // SVE: to nxv2f16
3734 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3735 SVE_EXT_COST + SVE_FCVT_COST},
3736 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3737 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3738 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3739 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3740 SVE_EXT_COST + SVE_FCVT_COST},
3741 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3742 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3743 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3744
3745 // SVE: to nxv4f16
3746 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3747 SVE_EXT_COST + SVE_FCVT_COST},
3748 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3749 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3750 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3751 SVE_EXT_COST + SVE_FCVT_COST},
3752 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3753 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3754
3755 // SVE: to nxv8f16
3756 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3757 SVE_EXT_COST + SVE_FCVT_COST},
3758 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3759 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3760 SVE_EXT_COST + SVE_FCVT_COST},
3761 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3762
3763 // SVE: to nxv16f16
3764 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3765 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3766 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3767 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3768
3769 // Complex: to v2f32
3770 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3771 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3772 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3773 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3774
3775 // SVE: to nxv2f32
3776 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3777 SVE_EXT_COST + SVE_FCVT_COST},
3778 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3779 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3780 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3781 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3782 SVE_EXT_COST + SVE_FCVT_COST},
3783 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3784 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3785 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3786
3787 // Complex: to v4f32
3788 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3789 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3790 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3791 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3792
3793 // SVE: to nxv4f32
3794 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3795 SVE_EXT_COST + SVE_FCVT_COST},
3796 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3797 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3798 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3799 SVE_EXT_COST + SVE_FCVT_COST},
3800 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3801 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3802
3803 // Complex: to v8f32
3804 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3805 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3806 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3807 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3808
3809 // SVE: to nxv8f32
3810 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3811 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3812 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3813 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3814 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3815 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3816 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3817 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3818
3819 // SVE: to nxv16f32
3820 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3821 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3822 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3823 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3824
3825 // Complex: to v16f32
3826 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3827 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3828
3829 // Complex: to v2f64
3830 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3831 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3832 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3833 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3834 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3835 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3836
3837 // SVE: to nxv2f64
3838 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3839 SVE_EXT_COST + SVE_FCVT_COST},
3840 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3841 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3842 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3843 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3844 SVE_EXT_COST + SVE_FCVT_COST},
3845 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3846 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3847 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3848
3849 // Complex: to v4f64
3850 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3851 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3852
3853 // SVE: to nxv4f64
3854 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3855 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3856 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3857 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3858 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3859 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3860 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3861 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3862 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3863 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3864 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3865 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3866
3867 // SVE: to nxv8f64
3868 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3869 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3870 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3871 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3872 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3873 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3874 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3875 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3876
3877 // LowerVectorFP_TO_INT
3878 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3879 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3880 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3881 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3882 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3883 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3884
3885 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3886 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3887 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3888 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3889 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3890 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3891 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3892
3893 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3894 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3895 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3896 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3897 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3898
3899 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3900 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3901 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3902 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3903 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3904 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3905 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3906
3907 // Complex, from nxv2f32.
3908 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3909 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3910 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3911 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3912 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3913 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3914 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3915 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3916
3917 // Complex, from nxv2f64.
3918 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3919 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3920 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3921 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3922 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3923 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3924 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3925 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3926 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3927 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3928
3929 // Complex, from nxv4f32.
3930 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3931 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3932 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3933 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3934 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3935 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3936 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3937 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3938 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3939 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3940
3941 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3942 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3943 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3944 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3945 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3946
3947 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3948 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3949 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3950 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3951 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3952 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3953 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3954
3955 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3956 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3957 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3958 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3959 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3960
3961 // Complex, from nxv8f16.
3962 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3963 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3964 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3965 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3966 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3967 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3968 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3969 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3970 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3971 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3972
3973 // Complex, from nxv4f16.
3974 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3975 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3976 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3977 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3978 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3979 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3980 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3981 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3982
3983 // Complex, from nxv2f16.
3984 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3985 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3986 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3987 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3988 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3989 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3990 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3991 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3992
3993 // Truncate from nxvmf32 to nxvmf16.
3994 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3995 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3996 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3997
3998 // Truncate from nxvmf32 to nxvmbf16.
3999 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
4000 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
4001 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
4002
4003 // Truncate from nxvmf64 to nxvmf16.
4004 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
4005 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
4006 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
4007
4008 // Truncate from nxvmf64 to nxvmbf16.
4009 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
4010 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
4011 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
4012
4013 // Truncate from nxvmf64 to nxvmf32.
4014 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
4015 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
4016 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
4017
4018 // Extend from nxvmf16 to nxvmf32.
4019 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
4020 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
4021 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
4022
4023 // Extend from nxvmbf16 to nxvmf32.
4024 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
4025 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
4026 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
4027
4028 // Extend from nxvmf16 to nxvmf64.
4029 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
4030 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
4031 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
4032
4033 // Extend from nxvmbf16 to nxvmf64.
4034 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
4035 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
4036 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
4037
4038 // Extend from nxvmf32 to nxvmf64.
4039 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
4040 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
4041 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
4042
4043 // Bitcasts from float to integer
4044 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
4045 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
4046 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
4047
4048 // Bitcasts from integer to float
4049 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
4050 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
4051 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
4052
4053 // Add cost for extending to illegal -too wide- scalable vectors.
4054 // zero/sign extend are implemented by multiple unpack operations,
4055 // where each operation has a cost of 1.
4056 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
4057 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
4058 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
4059 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
4060 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
4061 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
4062
4063 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
4064 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
4065 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
4066 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
4067 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
4068 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
4069 };
4070
4071 if (const auto *Entry = ConvertCostTableLookup(
4072 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
4073 return Entry->Cost;
4074
4075 static const TypeConversionCostTblEntry FP16Tbl[] = {
4076 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
4077 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
4078 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
4079 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
4080 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
4081 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
4082 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
4083 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
4084 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
4085 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
4086 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
4087 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
4088 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
4089 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
4090 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
4091 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
4092 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
4093 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
4094 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
4095 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
4096 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
4097 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
4098 };
4099
4100 if (ST->hasFullFP16())
4101 if (const auto *Entry = ConvertCostTableLookup(
4102 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
4103 return Entry->Cost;
4104
4105 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
4106 // double-rounding issues.
4107 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
4108 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
4110 return cast<FixedVectorType>(Dst)->getNumElements() *
4111 getCastInstrCost(Opcode, Dst->getScalarType(),
4112 Src->getScalarType(), CCH, CostKind) +
4114 true, CostKind) +
4116 false, CostKind);
4117
4118 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4120 ST->isSVEorStreamingSVEAvailable() &&
4121 TLI->getTypeAction(Src->getContext(), SrcTy) ==
4123 TLI->getTypeAction(Dst->getContext(), DstTy) ==
4125 // The standard behaviour in the backend for these cases is to split the
4126 // extend up into two parts:
4127 // 1. Perform an extending load or masked load up to the legal type.
4128 // 2. Extend the loaded data to the final type.
4129 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
4130 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
4132 Opcode, LegalTy, Src, CCH, CostKind, I);
4134 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
4135 return Part1 + Part2;
4136 }
4137
4138 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4139 // but we also want to include the TTI::CastContextHint::Masked case too.
4140 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4142 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4144
4145 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4146}
4147
4150 VectorType *VecTy, unsigned Index,
4152
     // Cost of extracting element `Index` from `VecTy` and sign-/zero-extending
     // it to `Dst`. Every return path yields the extract-element cost, plus the
     // cast cost whenever the extend cannot be had for free with the extract.
4153 // Make sure we were given a valid extend opcode.
4154 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4155 "Invalid opcode");
4156
4157 // We are extending an element we extract from a vector, so the source type
4158 // of the extend is the element type of the vector.
4159 auto *Src = VecTy->getElementType();
4160
4161 // Sign- and zero-extends are for integer types only.
4162 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4163
4164 // Get the cost for the extract. We compute the cost (if any) for the extend
4165 // below.
4166 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4167 CostKind, Index, nullptr, nullptr);
4168
4169 // Legalize the types.
4170 auto VecLT = getTypeLegalizationCost(VecTy);
4171 auto DstVT = TLI->getValueType(DL, Dst);
4172 auto SrcVT = TLI->getValueType(DL, Src);
4173
4174 // If the resulting type is still a vector and the destination type is legal,
4175 // we may get the extension for free. If not, get the default cost for the
4176 // extend.
4177 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4178 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4179 CostKind);
4180
4181 // The destination type should be larger than the element type. If not, get
4182 // the default cost for the extend.
     // Note the comparison is strict: equal bit widths continue to the switch
     // below rather than taking this early return.
4183 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4184 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4185 CostKind);
4186
4187 switch (Opcode) {
4188 default:
4189 llvm_unreachable("Opcode should be either SExt or ZExt");
4190
4191 // For sign-extends, we only need a smov, which performs the extension
4192 // automatically.
4193 case Instruction::SExt:
4194 return Cost;
4195
4196 // For zero-extends, the extend is performed automatically by a umov unless
4197 // the destination type is i64 and the element type is i8 or i16.
4198 case Instruction::ZExt:
4199 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4200 return Cost;
     // Otherwise leave the switch so the explicit extend cost is added below.
4201 }
4202
4203 // If we are unable to perform the extend for free, get the default cost.
4204 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4205 CostKind);
4206}
4207
4210 const Instruction *I) const {
     // NOTE(review): the condition guarding the return below is not visible in
     // this listing; the assert that follows suggests it selects every cost kind
     // other than TCK_RecipThroughput -- confirm against the original source.
     // On that path a PHI is costed at 0 and any other opcode at 1.
4212 return Opcode == Instruction::PHI ? 0 : 1;
4213 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4214 // Branches are assumed to be predicted.
     // Hence the cost reported on this remaining path is 0.
4215 return 0;
4216}
4217
4218InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4219 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4220 const Instruction *I, Value *Scalar,
4221 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4222 TTI::VectorInstrContext VIC) const {
4223 assert(Val->isVectorTy() && "This must be a vector type");
4224
4225 if (Index != -1U) {
4226 // Legalize the type.
4227 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4228
4229 // This type is legalized to a scalar type.
4230 if (!LT.second.isVector())
4231 return 0;
4232
4233 // The type may be split. For fixed-width vectors we can normalize the
4234 // index to the new type.
4235 if (LT.second.isFixedLengthVector()) {
4236 unsigned Width = LT.second.getVectorNumElements();
4237 Index = Index % Width;
4238 }
4239
4240 // The element at index zero is already inside the vector.
4241 // - For an insert-element or extract-element
4242 // instruction that extracts integers, an explicit FPR -> GPR move is
4243 // needed. So it has non-zero cost.
4244 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4245 return 0;
4246
4247 // This is recognising a LD1 single-element structure to one lane of one
4248 // register instruction. I.e., if this is an `insertelement` instruction,
4249 // and its second operand is a load, then we will generate a LD1, which
4250 // are expensive instructions on some uArchs.
4251 if (VIC == TTI::VectorInstrContext::Load) {
4252 if (ST->hasFastLD1Single())
4253 return 0;
4254 return CostKind == TTI::TCK_CodeSize
4255 ? 0
4257 }
4258
4259 // i1 inserts and extract will include an extra cset or cmp of the vector
4260 // value. Increase the cost by 1 to account.
4261 if (Val->getScalarSizeInBits() == 1)
4262 return CostKind == TTI::TCK_CodeSize
4263 ? 2
4264 : ST->getVectorInsertExtractBaseCost() + 1;
4265
4266 // FIXME:
4267 // If the extract-element and insert-element instructions could be
4268 // simplified away (e.g., could be combined into users by looking at use-def
4269 // context), they have no cost. This is not done in the first place for
4270 // compile-time considerations.
4271 }
4272
4273 // In case of Neon, if there exists extractelement from lane != 0 such that
4274 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4275 // 2. extractelement result feeds into fmul.
4276 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4277 // equivalent to 0.
4278 // then the extractelement can be merged with fmul in the backend and it
4279 // incurs no cost.
4280 // e.g.
4281 // define double @foo(<2 x double> %a) {
4282 // %1 = extractelement <2 x double> %a, i32 0
4283 // %2 = extractelement <2 x double> %a, i32 1
4284 // %res = fmul double %1, %2
4285 // ret double %res
4286 // }
4287 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4288 auto ExtractCanFuseWithFmul = [&]() {
4289 // We bail out if the extract is from lane 0.
4290 if (Index == 0)
4291 return false;
4292
4293 // Check if the scalar element type of the vector operand of ExtractElement
4294 // instruction is one of the allowed types.
4295 auto IsAllowedScalarTy = [&](const Type *T) {
4296 return T->isFloatTy() || T->isDoubleTy() ||
4297 (T->isHalfTy() && ST->hasFullFP16());
4298 };
4299
4300 // Check if the extractelement user is scalar fmul.
4301 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4302 // Check if the user is scalar fmul.
4303 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4304 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4305 !BO->getType()->isVectorTy();
4306 };
4307
4308 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4309 // certain scalar type and a certain vector register width.
4310 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4311 auto RegWidth =
4313 .getFixedValue();
4314 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4315 };
4316
4317 // Check if the type constraints on input vector type and result scalar type
4318 // of extractelement instruction are satisfied.
4319 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4320 return false;
4321
4322 if (Scalar) {
4323 DenseMap<User *, unsigned> UserToExtractIdx;
4324 for (auto *U : Scalar->users()) {
4325 if (!IsUserFMulScalarTy(U))
4326 return false;
4327 // Recording entry for the user is important. Index value is not
4328 // important.
4329 UserToExtractIdx[U];
4330 }
4331 if (UserToExtractIdx.empty())
4332 return false;
4333 for (auto &[S, U, L] : ScalarUserAndIdx) {
4334 for (auto *U : S->users()) {
4335 if (UserToExtractIdx.contains(U)) {
4336 auto *FMul = cast<BinaryOperator>(U);
4337 auto *Op0 = FMul->getOperand(0);
4338 auto *Op1 = FMul->getOperand(1);
4339 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4340 UserToExtractIdx[U] = L;
4341 break;
4342 }
4343 }
4344 }
4345 }
4346 for (auto &[U, L] : UserToExtractIdx) {
4347 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4348 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4349 return false;
4350 }
4351 } else {
4352 const auto *EE = cast<ExtractElementInst>(I);
4353
4354 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4355 if (!IdxOp)
4356 return false;
4357
4358 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4359 if (!IsUserFMulScalarTy(U))
4360 return false;
4361
4362 // Check if the other operand of extractelement is also extractelement
4363 // from lane equivalent to 0.
4364 const auto *BO = cast<BinaryOperator>(U);
4365 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4366 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4367 if (OtherEE) {
4368 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4369 if (!IdxOp)
4370 return false;
4371 return IsExtractLaneEquivalentToZero(
4372 cast<ConstantInt>(OtherEE->getIndexOperand())
4373 ->getValue()
4374 .getZExtValue(),
4375 OtherEE->getType()->getScalarSizeInBits());
4376 }
4377 return true;
4378 });
4379 }
4380 return true;
4381 };
4382
4383 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4384 ExtractCanFuseWithFmul())
4385 return 0;
4386
4387 // All other insert/extracts cost this much.
4388 return CostKind == TTI::TCK_CodeSize ? 1
4389 : ST->getVectorInsertExtractBaseCost();
4390}
4391
4393 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4394 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Operand-aware cost query for a single insert/extract. Apart from the
// poison-insert shortcut below, the cost is delegated to
// getVectorInstrCostHelper with no context instruction and no scalar (both
// nullptr) and an empty scalar-user list. Op1 is not inspected here.
4395 // Treat insert at lane 0 into a poison vector as having zero cost. This
4396 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4397 // single dup) are treated as cheap.
4398 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4399 isa<PoisonValue>(Op0))
4400 return 0;
4401 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4402 nullptr, {}, VIC);
4403}
4404
4406 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4407 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4408 TTI::VectorInstrContext VIC) const {
// Variant for callers that describe the element by a scalar value plus a list
// of (value, user, index) tuples instead of an existing instruction: the
// helper is called with a null instruction and the scalar context forwarded
// unchanged.
// NOTE(review): the int in each tuple is presumably a lane index -- confirm
// against the helper's use of ScalarUserAndIdx.
4409 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4410 ScalarUserAndIdx, VIC);
4411}
4412
4415 TTI::TargetCostKind CostKind, unsigned Index,
4416 TTI::VectorInstrContext VIC) const {
// Instruction-based variant: the opcode is read from I, and I itself is handed
// to the helper as the context instruction. No scalar value and no
// scalar-user list are supplied.
4417 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4418 nullptr, {}, VIC);
4419}
4420
4424 unsigned Index) const {
// Fixed-width vectors take the early-return path guarded directly below;
// everything after it is the estimate used for the remaining (non-fixed)
// vector types.
4425 if (isa<FixedVectorType>(Val))
4427 Index);
4428
4429 // This typically requires both while and lastb instructions in order
4430 // to extract the last element. If this is in a loop the while
4431 // instruction can at least be hoisted out, although it will consume a
4432 // predicate register. The cost should be more expensive than the base
4433 // extract cost, which is 2 for most CPUs.
// Code size is a flat 2; any other cost kind is one more than the subtarget's
// base insert/extract cost.
4434 return CostKind == TTI::TCK_CodeSize
4435 ? 2
4436 : ST->getVectorInsertExtractBaseCost() + 1;
4437}
4438
4440 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4441 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4442 TTI::VectorInstrContext VIC) const {
// Floating-point element types defer to the generic implementation. Note that
// ForPoisonSrc, VL and VIC are not forwarded in that call.
4445 if (Ty->getElementType()->isFloatingPointTy())
4446 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4447 CostKind);
// Otherwise every demanded element is charged one base insert/extract cost
// (1 for code size) per requested direction: Insert and Extract are bools, so
// their sum is 0, 1 or 2.
4448 unsigned VecInstCost =
4449 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4450 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4451}
4452
// Cost of carrying out an operation on half/bfloat values by promoting them
// to float.
//
// Returns std::nullopt when no promotion is modelled, i.e. when
//  - the scalar type is neither half nor bfloat,
//  - the scalar type is half and the subtarget has full FP16, or
//  - CanUseSVE is set and the subtarget has +sve-b16b16 with non-streaming SVE
//    or SME2 available.
// Otherwise the result is the fpext cost (doubled when neither operand is a
// constant), plus InstCost(PromotedTy) for the operation itself, plus an
// fptrunc back to Ty when IncludeTrunc is set.
4453std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4455 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4456 std::function<InstructionCost(Type *)> InstCost) const {
4457 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4458 return std::nullopt;
4459 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4460 return std::nullopt;
4461 // If we have +sve-b16b16 the operation can be promoted to SVE.
4462 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4463 return std::nullopt;
4464
// Same shape as Ty (scalar or vector), but with float elements.
4465 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4466 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
// Two non-constant operands means two extends.
4468 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4469 Cost *= 2;
4470 Cost += InstCost(PromotedTy);
4471 if (IncludeTrunc)
4472 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4474 return Cost;
4475}
4476
4478 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4480 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4481
4482 // The code-generator is currently not able to handle scalable vectors
4483 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4484 // it. This change will be removed when code-generation for these types is
4485 // sufficiently reliable.
4486 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4487 if (VTy->getElementCount() == ElementCount::getScalable(1))
4489
4490 // TODO: Handle more cost kinds.
4492 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4493 Op2Info, Args, CxtI);
4494
4495 // Legalize the type.
4496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4497 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4498
4499 // Increase the cost for half and bfloat types if not architecturally
4500 // supported.
4501 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4502 ISD == ISD::FDIV || ISD == ISD::FREM) {
4503 if (auto PromotedCost = getFP16BF16PromoteCost(
4504 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4505 // There is no native support for fdiv/frem even with +sve-b16b16.
4506 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4507 [&](Type *PromotedTy) {
4508 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4509 Op1Info, Op2Info);
4510 }))
4511 return *PromotedCost;
4512
4513 // fp128 all go via libcalls
4514 if (Ty->getScalarType()->isFP128Ty())
4515 return (CostKind == TTI::TCK_CodeSize ? 1 : 10) * LT.first;
4516 }
4517
4518 // If the operation is a widening instruction (smull or umull) and both
4519 // operands are extends the cost can be cheaper by considering that the
4520 // operation will operate on the narrowest type size possible (double the
4521 // largest input size) and a further extend.
4522 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4523 if (ExtTy != Ty)
4524 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4525 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4527 return LT.first;
4528 }
4529
4530 switch (ISD) {
4531 default:
4532 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4533 Op2Info);
4534 case ISD::ADD:
4535 case ISD::SUB:
4536 return LT.first; // Also works for i128
4537 case ISD::MUL:
4538 if (LT.second == MVT::v2i64) {
4539 // When SVE is available, then we can lower the v2i64 operation using
4540 // the SVE mul instruction, which has a lower cost.
4541 if (ST->hasSVE())
4542 return LT.first;
4543
4544 // When SVE is not available, there is no MUL.2d instruction,
4545 // which means mul <2 x i64> is expensive as elements are extracted
4546 // from the vectors and the muls scalarized.
4547 // As getScalarizationOverhead is a bit too pessimistic, we
4548 // estimate the cost for an i64 vector directly here, which is:
4549 // - four 2-cost i64 extracts,
4550 // - two 2-cost i64 inserts, and
4551 // - two 1-cost muls.
4552 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4553 // LT.first = 2 the cost is 28.
4554 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4555 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4556 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4557 nullptr, nullptr) *
4558 2 +
4559 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4560 nullptr, nullptr));
4561 }
4562 return LT.first;
4563 case ISD::SREM:
4564 case ISD::SDIV:
4565 /*
4566 Notes for sdiv/srem specific costs:
4567 1. This only considers the cases where the divisor is constant, uniform and
4568 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4569 result in some form of (ldr + adrp), corresponding to constant vectors, or
4570 scalarization of the division operation.
4571 2. Constant divisors, either negative in whole or partially, don't result in
4572 significantly different codegen as compared to positive constant divisors.
4573 So, we don't consider negative divisors separately.
4574 3. If the codegen is significantly different with SVE, it has been indicated
4575 using comments at appropriate places.
4576
4577 sdiv specific cases:
4578 -----------------------------------------------------------------------
4579 codegen | pow-of-2 | Type
4580 -----------------------------------------------------------------------
4581 add + cmp + csel + asr | Y | i64
4582 add + cmp + csel + asr | Y | i32
4583 -----------------------------------------------------------------------
4584
4585 srem specific cases:
4586 -----------------------------------------------------------------------
4587 codegen | pow-of-2 | Type
4588 -----------------------------------------------------------------------
4589 negs + and + and + csneg | Y | i64
4590 negs + and + and + csneg | Y | i32
4591 -----------------------------------------------------------------------
4592
4593 other sdiv/srem cases:
4594 -------------------------------------------------------------------------
4595 common codegen | + srem | + sdiv | pow-of-2 | Type
4596 -------------------------------------------------------------------------
4597 smulh + asr + add + add | - | - | N | i64
4598 smull + lsr + add + add | - | - | N | i32
4599 usra | and + sub | sshr | Y | <2 x i64>
4600 2 * (scalar code) | - | - | N | <2 x i64>
4601 usra | bic + sub | sshr + neg | Y | <4 x i32>
4602 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4603 + sshr + usra | | | |
4604 -------------------------------------------------------------------------
4605 */
4606 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4607 InstructionCost AddCost =
4608 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4609 Op1Info.getNoProps(), Op2Info.getNoProps());
4610 InstructionCost AsrCost =
4611 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4612 Op1Info.getNoProps(), Op2Info.getNoProps());
4613 InstructionCost MulCost =
4614 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4615 Op1Info.getNoProps(), Op2Info.getNoProps());
4616 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4617 // have similar cost.
4618 auto VT = TLI->getValueType(DL, Ty);
4619 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4620 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4621 // Neg can be folded into the asr instruction.
4622 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4623 : (3 * AsrCost + AddCost);
4624 } else {
4625 return MulCost + AsrCost + 2 * AddCost;
4626 }
4627 } else if (VT.isVector()) {
4628 InstructionCost UsraCost = 2 * AsrCost;
4629 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4630 // Division with scalable types corresponds to native 'asrd'
4631 // instruction when SVE is available.
4632 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4633
4634 // One more for the negation in SDIV
4636 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4637 if (Ty->isScalableTy() && ST->hasSVE())
4638 Cost += 2 * AsrCost;
4639 else {
4640 Cost +=
4641 UsraCost +
4642 (ISD == ISD::SDIV
4643 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4644 : 2 * AddCost);
4645 }
4646 return Cost;
4647 } else if (LT.second == MVT::v2i64) {
4648 return VT.getVectorNumElements() *
4649 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4650 Op1Info.getNoProps(),
4651 Op2Info.getNoProps());
4652 } else {
4653 // When SVE is available, we get:
4654 // smulh + lsr + add/sub + asr + add/sub.
4655 if (Ty->isScalableTy() && ST->hasSVE())
4656 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4657 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4658 }
4659 }
4660 }
4661 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4662 LT.second.isFixedLengthVector()) {
4663 // FIXME: When the constant vector is non-uniform, this may result in
4664 // loading the vector from constant pool or in some cases, may also result
4665 // in scalarization. For now, we are approximating this with the
4666 // scalarization cost.
4667 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4668 CostKind, -1, nullptr, nullptr);
4669 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4670 CostKind, -1, nullptr, nullptr);
4671 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4672 return ExtractCost + InsertCost +
4673 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4674 CostKind, Op1Info.getNoProps(),
4675 Op2Info.getNoProps());
4676 }
4677 [[fallthrough]];
4678 case ISD::UDIV:
4679 case ISD::UREM: {
4680 auto VT = TLI->getValueType(DL, Ty);
4681 if (Op2Info.isConstant()) {
4682 // If the operand is a power of 2 we can use the shift or and cost.
4683 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4684 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4685 Op1Info.getNoProps(),
4686 Op2Info.getNoProps());
4687 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4688 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4689 Op1Info.getNoProps(),
4690 Op2Info.getNoProps());
4691
4692 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4693 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4694 // The MULHU will be expanded to UMULL for the types not listed below,
4695 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4696 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4697 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4698 LT.second == MVT::nxv16i8;
4699 bool Is128bit = LT.second.is128BitVector();
4700
4701 InstructionCost MulCost =
4702 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4703 Op1Info.getNoProps(), Op2Info.getNoProps());
4704 InstructionCost AddCost =
4705 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4706 Op1Info.getNoProps(), Op2Info.getNoProps());
4707 InstructionCost ShrCost =
4708 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4709 Op1Info.getNoProps(), Op2Info.getNoProps());
4710 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4711 (HasMULH ? 0 : ShrCost) + // UMULL shift
4712 AddCost * 2 + ShrCost;
4713 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4714 }
4715 }
4716
4717 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4718 // emitted by the backend even when those functions are not declared in the
4719 // module.
4720 if (!VT.isVector() && VT.getSizeInBits() > 64)
4721 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4722
4724 Opcode, Ty, CostKind, Op1Info, Op2Info);
4725 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4726 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4727 // SDIV/UDIV operations are lowered using SVE, then we can have less
4728 // costs.
4729 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4730 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4731 static const CostTblEntry DivTbl[]{
4732 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4733 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4734 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4735 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4736 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4737 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4738
4739 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4740 if (nullptr != Entry)
4741 return Entry->Cost;
4742 }
4743 // For 8/16-bit elements, the cost is higher because the type
4744 // requires promotion and possibly splitting:
4745 if (LT.second.getScalarType() == MVT::i8)
4746 Cost *= 8;
4747 else if (LT.second.getScalarType() == MVT::i16)
4748 Cost *= 4;
4749 return Cost;
4750 } else {
4751 // If one of the operands is a uniform constant then the cost for each
4752 // element is Cost for insertion, extraction and division.
4753 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4754 // operation with scalar type
4755 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4756 (Op2Info.isConstant() && Op2Info.isUniform())) {
4757 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4759 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4760 return (4 + DivCost) * VTy->getNumElements();
4761 }
4762 }
4763 // On AArch64, without SVE, vector divisions are expanded
4764 // into scalar divisions of each pair of elements.
4765 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4766 -1, nullptr, nullptr);
4767 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4768 nullptr, nullptr);
4769 }
4770
4771 // TODO: if one of the arguments is scalar, then it's not necessary to
4772 // double the cost of handling the vector elements.
4773 Cost += Cost;
4774 }
4775 return Cost;
4776 }
4777 case ISD::XOR:
4778 case ISD::OR:
4779 case ISD::AND:
4780 case ISD::SRL:
4781 case ISD::SRA:
4782 case ISD::SHL:
4783 // These nodes are marked as 'custom' for combining purposes only.
4784 // We know that they are legal. See LowerAdd in ISelLowering.
4785 return LT.first;
4786
4787 case ISD::FNEG:
4788 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4789 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4790 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4791 CxtI &&
4792 ((CxtI->hasOneUse() &&
4793 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4794 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4795 return 0;
4796 [[fallthrough]];
4797 case ISD::FADD:
4798 case ISD::FSUB:
4799 if (!Ty->getScalarType()->isFP128Ty())
4800 return LT.first;
4801 [[fallthrough]];
4802 case ISD::FMUL:
4803 case ISD::FDIV:
4804 // These nodes are marked as 'custom' just to lower them to SVE.
4805 // We know said lowering will incur no additional cost.
4806 if (!Ty->getScalarType()->isFP128Ty())
4807 return 2 * LT.first;
4808
4809 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4810 Op2Info);
4811 case ISD::FREM:
4812 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4813 // those functions are not declared in the module.
4814 if (!Ty->isVectorTy())
4815 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4816 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4817 Op2Info);
4818 }
4819}
4820
4823 const SCEV *Ptr,
4825 // Address computations in vectorized code with non-consecutive addresses will
4826 // likely result in more instructions compared to scalar code where the
4827 // computation can more often be merged into the index mode. The resulting
4828 // extra micro-ops can significantly decrease throughput.
// The penalty is the value of the -neon-nonconst-stride-overhead option.
4829 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4830 int MaxMergeDistance = 64;
4831
// Only vector pointer types with scalar-evolution info available can be
// penalised, and only when the base-class check for a constant stride below
// MaxMergeDistance + 1 fails.
4832 if (PtrTy->isVectorTy() && SE &&
4833 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4834 return NumVectorInstToHideOverhead;
4835
4836 // In many cases the address computation is not merged into the instruction
4837 // addressing mode.
4838 return 1;
4839}
4840
4841/// Check whether Opcode1 has less throughput according to the scheduling
4842/// model than Opcode2.
/// Returns false (i.e. "not known") when the subtarget has no instruction
/// scheduling model or when either opcode's scheduling class is invalid.
4844 unsigned Opcode1, unsigned Opcode2) const {
4845 const MCSchedModel &Sched = ST->getSchedModel();
4846 const TargetInstrInfo *TII = ST->getInstrInfo();
4847 if (!Sched.hasInstrSchedModel())
4848 return false;
4849
// Map each opcode to its scheduling class through the instruction description.
4850 const MCSchedClassDesc *SCD1 =
4851 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4852 const MCSchedClassDesc *SCD2 =
4853 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4854 // We cannot handle variant scheduling classes without an MI. If we need to
4855 // support them for any of the instructions we query the information of we
4856 // might need to add a way to resolve them without a MI or not use the
4857 // scheduling info.
4858 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4859 "Cannot handle variant scheduling classes without an MI");
4860 if (!SCD1->isValid() || !SCD2->isValid())
4861 return false;
4862
// A larger reciprocal throughput means a lower throughput.
4863 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4865}
4866
4868 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4870 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
// Overview of the cases handled, in order:
//  1. Select on a fixed-width vector: cheap compare+select pairs on a set of
//     legal types, then a table of known-expensive wide selects.
//  2. FCmp: half/bfloat promotion, libcall modelling for element types other
//     than f16/f32/f64, and a multiplier for predicates that need several
//     compares.
//  3. A signed/equality ICmp of an `and` against 0, 1 or -1 that can be folded
//     into a flag-setting and (throughput cost kind only).
//  4. Everything else: the generic implementation.
4871 // We don't lower some vector selects well that are wider than the register
4872 // width. TODO: Improve this with different cost kinds.
4873 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4874 // We would need this many instructions to hide the scalarization happening.
4875 const int AmortizationCost = 20;
4876
4877 // If VecPred is not set, check if we can get a predicate from the context
4878 // instruction, if its type matches the requested ValTy.
4879 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4880 CmpPredicate CurrentPred;
4881 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4882 m_Value())))
4883 VecPred = CurrentPred;
4884 }
4885 // Check if we have a compare/select chain that can be lowered using
4886 // a (F)CMxx & BFI pair.
4887 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4888 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4889 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4890 VecPred == CmpInst::FCMP_UNE) {
4891 static const auto ValidMinMaxTys = {
4892 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4893 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4894 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4895
// The f16 vector types only qualify when the subtarget has full FP16.
4896 auto LT = getTypeLegalizationCost(ValTy);
4897 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4898 (ST->hasFullFP16() &&
4899 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4900 return LT.first;
4901 }
4902
// Fixed costs for specific (condition type, value type) select combinations;
// the i64-element entries are scaled by AmortizationCost.
4903 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4904 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4905 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4906 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4907 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4908 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4909 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4910 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4911 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4912 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4913 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4914 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4915
// The table can only be consulted when both types map to simple value types.
4916 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4917 EVT SelValTy = TLI->getValueType(DL, ValTy);
4918 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4919 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4920 SelCondTy.getSimpleVT(),
4921 SelValTy.getSimpleVT()))
4922 return Entry->Cost;
4923 }
4924 }
4925
4926 if (Opcode == Instruction::FCmp) {
// Half/bfloat compares without native support are costed on the promoted
// type; no truncation of the result back to the narrow FP type is included
// (IncludeTrunc is false).
4927 if (auto PromotedCost = getFP16BF16PromoteCost(
4928 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4929 // TODO: Consider costing SVE FCMPs.
4930 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4932 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4933 CostKind, Op1Info, Op2Info);
4934 if (isa<VectorType>(PromotedTy))
4936 Instruction::Trunc,
4940 return Cost;
4941 }))
4942 return *PromotedCost;
4943
4944 auto LT = getTypeLegalizationCost(ValTy);
4945 // Model unknown fp compares as a libcall.
4946 if (LT.second.getScalarType() != MVT::f64 &&
4947 LT.second.getScalarType() != MVT::f32 &&
4948 LT.second.getScalarType() != MVT::f16)
4949 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4950 {ValTy, ValTy}, CostKind);
4951
4952 // Some comparison operators require expanding to multiple compares + or.
4953 unsigned Factor = 1;
4954 if (!CondTy->isVectorTy() &&
4955 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4956 Factor = 2; // fcmp with 2 selects
4957 else if (isa<FixedVectorType>(ValTy) &&
4958 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4959 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4960 Factor = 3; // fcmxx+fcmyy+or
4961 else if (isa<ScalableVectorType>(ValTy) &&
4962 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4963 Factor = 3; // fcmxx+fcmyy+or
4964
// Double the factor for scalable vectors when the scheduling model reports the
// SVE compare as having lower throughput than the NEON one.
4965 if (isa<ScalableVectorType>(ValTy) &&
4967 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4968 AArch64::FCMEQv4f32))
4969 Factor *= 2;
4970
// Latency uses a fixed per-compare cost of 2; other cost kinds scale with the
// legalization cost.
4971 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4972 }
4973
4974 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4975 // icmp(and, 0) as free, as we can make use of ands, but only if the
4976 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4977 // providing it will not cause performance regressions.
4978 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4979 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4980 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4981 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4982 if (match(I->getOperand(1), m_Zero()))
4983 return 0;
4984
4985 // x >= 1 / x < 1 -> x > 0 / x <= 0
4986 if (match(I->getOperand(1), m_One()) &&
4987 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4988 return 0;
4989
4990 // x <= -1 / x > -1 -> x > 0 / x <= 0
4991 if (match(I->getOperand(1), m_AllOnes()) &&
4992 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4993 return 0;
4994 }
4995
4996 // The base case handles scalable vectors fine for now, since it treats the
4997 // cost as 1 * legalization cost.
4998 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4999 Op1Info, Op2Info, I);
5000}
5001
// Builds the memcmp-expansion options for this subtarget. With strict
// alignment the Options object is returned without any of the settings below
// being applied. IsZeroCmp is not consulted in the visible body.
5003AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
5005 if (ST->requiresStrictAlign()) {
5006 // TODO: Add cost modeling for strict align. Misaligned loads expand to
5007 // a bunch of instructions when strict align is enabled.
5008 return Options;
5009 }
5010 Options.AllowOverlappingLoads = true;
// The load budget comes from the target lowering; the per-block count is set
// to the same value.
5011 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
5012 Options.NumLoadsPerBlock = Options.MaxNumLoads;
5013 // TODO: Though vector loads usually perform well on AArch64, in some targets
5014 // they may wake up the FP unit, which raises the power consumption. Perhaps
5015 // they could be used with no holds barred (-O3).
5016 Options.LoadSizes = {8, 4, 2, 1};
5017 Options.AllowedTailExpansions = {3, 5, 6};
5018 return Options;
5019}
5020
// The answer depends solely on whether the subtarget has SVE.
5022 return ST->hasSVE();
5023}
5024
// Dispatch on the intrinsic ID: gathers/scatters and masked
// load/expandload/store each have a dedicated cost routine. Any other ID
// leaves the switch without returning.
5028 switch (MICA.getID()) {
5029 case Intrinsic::masked_scatter:
5030 case Intrinsic::masked_gather:
5031 return getGatherScatterOpCost(MICA, CostKind);
5032 case Intrinsic::masked_load:
5033 case Intrinsic::masked_expandload:
5034 case Intrinsic::masked_store:
5035 return getMaskedMemoryOpCost(MICA, CostKind);
5036 }
5038}
5039
// Cost of a masked load, store or expanding load of the data type carried by
// MICA. The base cost is the legalization count, doubled for expandload and
// doubled again when the operation is split and the element type is wider
// than 8 bits.
5043 Type *Src = MICA.getDataType();
5044
// NEON fixed-width vectors are handled separately by the early-out below.
5045 if (useNeonVector(Src))
5047 auto LT = getTypeLegalizationCost(Src);
5048 if (!LT.first.isValid())
5050
5051 // Return an invalid cost for element types that we are unable to lower.
5052 auto *VT = cast<VectorType>(Src);
5053 if (VT->getElementType()->isIntegerTy(1))
5055
5056 // The code-generator is currently not able to handle scalable vectors
5057 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5058 // it. This change will be removed when code-generation for these types is
5059 // sufficiently reliable.
5060 if (VT->getElementCount() == ElementCount::getScalable(1))
5062
5063 InstructionCost MemOpCost = LT.first;
5064 if (MICA.getID() == Intrinsic::masked_expandload) {
5065 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
5067
5068 // Operation will be split into expand of masked.load
5069 MemOpCost *= 2;
5070 }
5071
5072 // If we need to split the memory operation, we will also need to split the
5073 // mask. This will likely lead to overestimating the cost in some cases if
5074 // multiple memory operations use the same mask, but we often don't have
5075 // enough context to figure that out here.
5076 //
5077 // If the elements being loaded are bytes then the mask will already be split,
5078 // since the number of bits in a P register matches the number of bytes in a
5079 // Z register.
5080 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
5081 return MemOpCost * 2;
5082
5083 return MemOpCost;
5084}
5085
5086// This function returns gather/scatter overhead either from
5087// user-provided value or specialized values per-target from \p ST.
5088static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
5089 const AArch64Subtarget *ST) {
5090 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5091 "Should be called on only load or stores.");
5092 switch (Opcode) {
5093 case Instruction::Load:
5094 if (SVEGatherOverhead.getNumOccurrences() > 0)
5095 return SVEGatherOverhead;
5096 return ST->getGatherOverhead();
5097 break;
5098 case Instruction::Store:
5099 if (SVEScatterOverhead.getNumOccurrences() > 0)
5100 return SVEScatterOverhead;
5101 return ST->getScatterOverhead();
5102 break;
5103 default:
5104 llvm_unreachable("Shouldn't have reached here");
5105 }
5106}
5107
5111
// Gathers (masked or VP) are costed as loads; every other intrinsic reaching
// this point is costed as a store.
5112 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
5113 MICA.getID() == Intrinsic::vp_gather)
5114 ? Instruction::Load
5115 : Instruction::Store;
5116
5117 Type *DataTy = MICA.getDataType();
5118 Align Alignment = MICA.getAlignment();
5119 const Instruction *I = MICA.getInst();
5120
// NEON fixed-width vectors and types without a legal masked gather/scatter
// take the early-out below.
5121 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
5123 auto *VT = cast<VectorType>(DataTy);
5124 auto LT = getTypeLegalizationCost(DataTy);
5125 if (!LT.first.isValid())
5127
5128 // Return an invalid cost for element types that we are unable to lower.
5129 if (!LT.second.isVector() ||
5130 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
5131 VT->getElementType()->isIntegerTy(1))
5133
5134 // The code-generator is currently not able to handle scalable vectors
5135 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5136 // it. This change will be removed when code-generation for these types is
5137 // sufficiently reliable.
5138 if (VT->getElementCount() == ElementCount::getScalable(1))
5140
// Start from the cost of a single scalar-element memory access (address space
// 0), scale it by the gather/scatter overhead, then by the maximum element
// count of the legalized vector and by the number of legalized parts.
5141 ElementCount LegalVF = LT.second.getVectorElementCount();
5142 InstructionCost MemOpCost =
5143 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5144 {TTI::OK_AnyValue, TTI::OP_None}, I);
5145 // Add on an overhead cost for using gathers/scatters.
5146 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
5147 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5148}
5149
// NOTE(review): the signature (listing line 5150) is missing from this
// extract; presumably this is the body of useNeonVector, which other code in
// this file calls with a single type argument -- confirm.
// Yields true when Ty is a fixed-width vector type and the subtarget does not
// use SVE for fixed-length vectors.
5151 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5152}
5153
// Cost of a (non-masked) load or store of type Ty.
// NOTE(review): this extract is missing listing lines 5154 and 5157 (parts of
// the signature), 5168 (statement guarded by the validity check), 5179-5180
// (end of the scalable-vector condition and its statement), 5183 (condition
// guarding the first `return LT.first;`) and 5293 (declaration of `Cost`).
5155 Align Alignment,
5156 unsigned AddressSpace,
5158 TTI::OperandValueInfo OpInfo,
5159 const Instruction *I) const {
5160 EVT VT = TLI->getValueType(DL, Ty, true);
5161 // Type legalization can't handle structs
5162 if (VT == MVT::Other)
5163 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5164 CostKind);
5165
5166 auto LT = getTypeLegalizationCost(Ty);
5167 if (!LT.first.isValid())
5169
5170 // The code-generator is currently not able to handle scalable vectors
5171 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5172 // it. This change will be removed when code-generation for these types is
5173 // sufficiently reliable.
5174 // We also only support full register predicate loads and stores.
5175 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5176 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5177 (VTy->getElementType()->isIntegerTy(1) &&
5178 !VTy->getElementCount().isKnownMultipleOf(
5181
5182 // TODO: consider latency as well for TCK_SizeAndLatency.
5184 return LT.first;
5185
5186 if (CostKind == TTI::TCK_Latency) {
5187 // Latency doesn't make much sense for stores, so just return 1
5188 if (Opcode == Instruction::Store)
5189 return 1;
5190 // If the subtarget has overridden the load latency then use that instead of
5191 // querying the SchedModel.
5192 if (ST->getFixedLoadLatency())
5193 return (LT.first - 1) + ST->getFixedLoadLatency();
5194 // We expect the load to become LT.first loads of type LT.second. The
5195 // latency will be the latency of the last load plus the time it takes to get
5196 // there, which will be the amount of other loads before that (i.e. total
5197 // loads - 1) multiplied by how long it takes to get through them (the
5198 // reciprocal of the throughput). We get the latency and reciprocal
5199 // throughput from the SchedModel, and assume that the loads become the
5200 // variant with unsigned integer offset.
// Pick a representative load opcode for LT.second: LDR_ZXI for scalable
// vectors or fixed vectors lowered with SVE, an FP/SIMD register load for
// other vector or floating-point types, and a general-purpose register load
// otherwise. The opcode is only used to look up its scheduling class.
5201 unsigned Inst = 0;
5202 if (LT.second.isScalableVector() ||
5203 ST->useSVEForFixedLengthVectors(LT.second)) {
5204 Inst = AArch64::LDR_ZXI;
5205 } else if (LT.second.isVector() || LT.second.isFloatingPoint()) {
5206 switch (LT.second.getSizeInBits()) {
5207 case 8:
5208 Inst = AArch64::LDRBui;
5209 break;
5210 case 16:
5211 Inst = AArch64::LDRHui;
5212 break;
5213 case 32:
5214 Inst = AArch64::LDRSui;
5215 break;
5216 case 64:
5217 Inst = AArch64::LDRDui;
5218 break;
5219 case 128:
5220 Inst = AArch64::LDRQui;
5221 break;
5222 default:
5223 llvm_unreachable("Unexpected float or vector type");
5224 }
5225 } else {
5226 switch (LT.second.getSizeInBits()) {
5227 case 8:
5228 Inst = AArch64::LDRBBui;
5229 break;
5230 case 16:
5231 Inst = AArch64::LDRHHui;
5232 break;
5233 case 32:
5234 Inst = AArch64::LDRWui;
5235 break;
5236 case 64:
5237 Inst = AArch64::LDRXui;
5238 break;
5239 default:
5240 llvm_unreachable("Unexpected integer type");
5241 }
5242 }
5243 const MCSchedModel &Sched = ST->getSchedModel();
5244 const TargetInstrInfo *TII = ST->getInstrInfo();
5245 unsigned SchedClass = TII->get(Inst).getSchedClass();
5246 const MCSchedClassDesc *SCD = Sched.getSchedClassDesc(SchedClass);
5247 // We need to convert the number of loads before the last to a float here,
5248 // as the reciprocal throughput may be fractional.
5249 float NumLoads = (LT.first - 1).getValue();
5250 return NumLoads * Sched.getReciprocalThroughput(*ST, *SCD) +
5251 Sched.computeInstrLatency(*ST, *SCD);
5252 }
5253
5254 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5255 LT.second.is128BitVector() && Alignment < Align(16)) {
5256 // Unaligned stores are extremely inefficient. We don't split all
5257 // unaligned 128-bit stores because the negative impact that has shown in
5258 // practice on inlined block copy code.
5259 // We make such stores expensive so that we will only vectorize if there
5260 // are 6 other instructions getting vectorized.
5261 const int AmortizationCost = 6;
5262
// I.e. 12 per legalized part.
5263 return LT.first * 2 * AmortizationCost;
5264 }
5265
5266 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5267 if (Ty->isPtrOrPtrVectorTy())
5268 return LT.first;
5269
5270 if (useNeonVector(Ty)) {
5271 // Check truncating stores and extending loads.
5272 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5273 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
5274 if (VT == MVT::v4i8)
5275 return 2;
5276 // Otherwise we need to scalarize.
5277 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5278 }
5279 EVT EltVT = VT.getVectorElementType();
5280 unsigned EltSize = EltVT.getScalarSizeInBits();
// The decomposition below is only applied to vectors with a power-of-2
// element size of 8..64 bits, fewer than 128 / EltSize elements, and an
// alignment of exactly 1; everything else costs LT.first.
5281 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5282 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5283 return LT.first;
5284 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5285 // widening to v4i8, which produces suboptimal results.
5286 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5287 return LT.first;
5288
5289 // Check non-power-of-2 loads/stores for legal vector element types with
5290 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5291 // operations on smaller power-of-2 ops, including ld1/st1.
5292 LLVMContext &C = Ty->getContext();
// Worklist decomposition: a type with a power-of-2 element count adds 1 to
// Cost; any other type is replaced by two smaller vector types with PrevPow2
// and (CurrNumElements - PrevPow2) elements, which are processed in turn.
5294 SmallVector<EVT> TypeWorklist;
5295 TypeWorklist.push_back(VT);
5296 while (!TypeWorklist.empty()) {
5297 EVT CurrVT = TypeWorklist.pop_back_val();
5298 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5299 if (isPowerOf2_32(CurrNumElements)) {
5300 Cost += 1;
5301 continue;
5302 }
5303
5304 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5305 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5306 TypeWorklist.push_back(
5307 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5308 }
5309 return Cost;
5310 }
5311
5312 return LT.first;
5313}
5314
// Cost of an interleaved load/store group with interleave factor Factor.
// NOTE(review): this extract is missing listing line 5315 (start of the
// signature) and lines 5323, 5330 and 5335 (the statements guarded by the
// three `if` conditions directly before them).
5316 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5317 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5318 bool UseMaskForCond, bool UseMaskForGaps) const {
5319 assert(Factor >= 2 && "Invalid interleave factor");
5320 auto *VecVTy = cast<VectorType>(VecTy);
5321
5322 if (VecTy->isScalableTy() && !ST->hasSVE())
5324
5325 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5326 // only have lowering for power-of-2 factors.
5327 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5328 // InterleavedAccessPass for ld3/st3
5329 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5331
5332 // Vectorization for masked interleaved accesses is only enabled for scalable
5333 // VF.
5334 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5336
5337 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5338 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
// Type of one de-interleaved member: the element count divided by Factor.
5339 auto *SubVecTy =
5340 VectorType::get(VecVTy->getElementType(),
5341 VecVTy->getElementCount().divideCoefficientBy(Factor));
5342
5343 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5344 // Accesses having vector types that are a multiple of 128 bits can be
5345 // matched to more than one ldN/stN instruction.
5346 bool UseScalable;
5347 if (MinElts % Factor == 0 &&
5348 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5349 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5350 }
5351
// Anything not handled above is costed by the generic implementation.
5352 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5353 Alignment, AddressSpace, CostKind,
5354 UseMaskForCond, UseMaskForGaps);
5355}
5356
// NOTE(review): listing lines 5357-5360 (the signature and the declarations of
// `Cost` and `CostKind`) are missing from this extract; the function name is
// therefore an assumption -- confirm against the full file.
// For every vector type in Tys whose total width (scalar size times element
// count) is exactly 128 bits, accumulate the cost of one store plus one load
// of that type. The cast<FixedVectorType> assumes the vector types in Tys are
// fixed-width.
5361 for (auto *I : Tys) {
5362 if (!I->isVectorTy())
5363 continue;
5364 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5365 128)
5366 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5367 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5368 }
5369 return Cost;
5370}
5371
// NOTE(review): listing line 5372 (start of the signature) is missing from
// this extract; presumably this is isLegalMaskedExpandLoad, which is called
// earlier in this file with a type and an alignment -- confirm.
// Alignment is not consulted in the visible body.
5373 Align Alignment) const {
5374 // Neon types should be scalarised when we are not choosing to use SVE.
5375 if (useNeonVector(DataTy))
5376 return false;
5377
5378 // Return true only if we are able to lower using the SVE2p2/SME2p2
5379 // expand instruction.
5380 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5381 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5382}
5383
// NOTE(review): listing line 5385 (function name and first parameter) is
// missing from this extract; the name is assumed from the subtarget query
// below -- confirm.
// Returns 4 for a scalar VF, or when HasUnorderedReductions is set and the
// known minimum VF is at most 4; otherwise returns the subtarget's maximum
// interleave factor.
5384unsigned
5386 bool HasUnorderedReductions) const {
5387 if (VF.isScalar() || (HasUnorderedReductions && VF.getKnownMinValue() <= 4))
5388 return 4;
5389 return ST->getMaxInterleaveFactor();
5390}
5391
5392// For Falkor, we want to avoid having too many strided loads in a loop since
5393// that can exhaust the HW prefetcher resources. We adjust the unroller
5394// MaxCount preference below to attempt to ensure unrolling doesn't create too
5395// many strided loads.
// NOTE(review): listing lines 5397-5398 (function name and parameter list) are
// missing from this extract; the body uses `L`, `SE` and `UP`.
5396static void
5399 enum { MaxStridedLoads = 7 };
// Counts loads in L whose pointer operand is not loop-invariant and whose
// SCEV is an affine add-recurrence.
5400 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5401 int StridedLoads = 0;
5402 // FIXME? We could make this more precise by looking at the CFG and
5403 // e.g. not counting loads in each side of an if-then-else diamond.
5404 for (const auto BB : L->blocks()) {
5405 for (auto &I : *BB) {
5406 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5407 if (!LMemI)
5408 continue;
5409
5410 Value *PtrValue = LMemI->getPointerOperand();
5411 if (L->isLoopInvariant(PtrValue))
5412 continue;
5413
5414 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5415 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5416 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5417 continue;
5418
5419 // FIXME? We could take pairing of unrolled load copies into account
5420 // by looking at the AddRec, but we would probably have to limit this
5421 // to loops with no stores or other memory optimization barriers.
5422 ++StridedLoads;
5423 // We've seen enough strided loads that seeing more won't make a
5424 // difference.
// With MaxStridedLoads = 7 the count therefore saturates at 4.
5425 if (StridedLoads > MaxStridedLoads / 2)
5426 return StridedLoads;
5427 }
5428 }
5429 return StridedLoads;
5430 };
5431
5432 int StridedLoads = countStridedLoads(L, SE);
5433 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5434 << " strided loads\n");
5435 // Pick the largest power of 2 unroll count that won't result in too many
5436 // strided loads.
// For 1, 2, 3 and 4 strided loads this gives a MaxCount of 4, 2, 2 and 1.
// UP.MaxCount is left untouched when no strided load was found.
5437 if (StridedLoads) {
5438 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5439 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5440 << UP.MaxCount << '\n');
5441 }
5442}
5443
5444// This function returns true if the loop:
5445// 1. Has a valid cost, and
5446// 2. Has a cost within the supplied budget.
5447// Otherwise it returns false.
// The cost is the sum of the TCK_CodeSize cost of every instruction in every
// block of the loop. If FinalSize is non-null it receives that sum, but only
// when the function returns true.
// NOTE(review): listing line 5448 (start of the signature) is missing from
// this extract; the body uses `L` and `TTI`.
5449 InstructionCost Budget,
5450 unsigned *FinalSize) {
5451 // Estimate the size of the loop.
5452 InstructionCost LoopCost = 0;
5453
5454 for (auto *BB : L->getBlocks()) {
5455 for (auto &I : *BB) {
5456 SmallVector<const Value *, 4> Operands(I.operand_values());
5457 InstructionCost Cost =
5458 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5459 // This can happen with intrinsics that don't currently have a cost model
5460 // or for some operations that require SVE.
5461 if (!Cost.isValid())
5462 return false;
5463
// Bail out as soon as the running total exceeds the budget.
5464 LoopCost += Cost;
5465 if (LoopCost > Budget)
5466 return false;
5467 }
5468 }
5469
5470 if (FinalSize)
5471 *FinalSize = LoopCost.getValue();
5472 return true;
5473}
5474
// Heuristic deciding whether a small loop should be runtime-unrolled; returns
// true only if every check below passes.
// NOTE(review): this extract is missing listing line 5475 (start of the
// signature), line 5481 (the condition on BTC guarding the first
// `return false;`) and line 5502 (the body of the any_of predicate).
5476 const AArch64TTIImpl &TTI) {
5477 // Only consider loops with unknown trip counts for which we can determine
5478 // a symbolic expression. Multi-exit loops with small known trip counts will
5479 // likely be unrolled anyway.
5480 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5482 return false;
5483
5484 // It might not be worth unrolling loops with low max trip counts. Restrict
5485 // this to max trip counts > 32 for now.
// A MaxTC of 0 is not rejected here.
5486 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5487 if (MaxTC > 0 && MaxTC <= 32)
5488 return false;
5489
5490 // Make sure the loop size is <= 5.
5491 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5492 return false;
5493
5494 // Small search loops with multiple exits can be highly beneficial to unroll.
5495 // We only care about loops with exactly two exiting blocks, although each
5496 // block could jump to the same exit block.
// NOTE(review): the check below counts all blocks of the loop
// (L->getBlocks()), not only its exiting blocks -- confirm that this matches
// the intent stated above.
5497 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5498 if (Blocks.size() != 2)
5499 return false;
5500
5501 if (any_of(Blocks, [](BasicBlock *BB) {
5503 }))
5504 return false;
5505
5506 return true;
5507}
5508
5509/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5510/// OOO engine's wide instruction window and various predictors.
// NOTE(review): this extract is missing listing lines 5512-5513 (function name
// and leading parameters), 5545 (start of the condition using BTC), 5553 (the
// condition guarding a bare `return;`), 5561 (one statement inside the
// HasParellelizableReductions block), 5599 (declaration of `Stores`), 5602
// (declaration of `Ptr`) and 5632 (declaration of `Preds`).
5511static void
5514 const AArch64TTIImpl &TTI) {
5515 // Limit loops with structure that is highly likely to benefit from runtime
5516 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5517 // likely with complex control flow). Note that the heuristics here may be
5518 // overly conservative and we err on the side of avoiding runtime unrolling
5519 // rather than unroll excessively. They are all subject to further refinement.
5520 if (!L->isInnermost() || L->getNumBlocks() > 8)
5521 return;
5522
5523 // Loops with multiple exits are handled by common code.
5524 if (!L->getExitBlock())
5525 return;
5526
5527 // Check if the loop contains any reductions that could be parallelized when
5528 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5529 // a multiple of 2.
// Only single-block loops whose size fits a budget of 12 qualify.
5530 bool HasParellelizableReductions =
5531 L->getNumBlocks() == 1 &&
5532 any_of(L->getHeader()->phis(),
5533 [&SE, L](PHINode &Phi) {
5534 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5535 }) &&
5536 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5537 if (HasParellelizableReductions &&
5538 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5539 UP.Partial = true;
5540 UP.MaxCount = 4;
5541 UP.AddAdditionalAccumulators = true;
5542 }
5543
// Skip loops whose constant maximum trip count is known and at most 32.
5544 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5546 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5547 SE.getSmallConstantMaxTripCount(L) <= 32))
5548 return;
5549
// Skip loops carrying the "llvm.loop.isvectorized" metadata.
5550 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5551 return;
5552
5554 return;
5555
5556 // Limit to loops with trip counts that are cheap to expand.
5557 UP.SCEVExpansionBudget = 1;
5558
5559 if (HasParellelizableReductions) {
5560 UP.Runtime = true;
5562 UP.AddAdditionalAccumulators = true;
5563 }
5564
5565 // Try to unroll small loops, of few-blocks with low budget, if they have
5566 // load/store dependencies, to expose more parallel memory access streams,
5567 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5568 BasicBlock *Header = L->getHeader();
5569 BasicBlock *Latch = L->getLoopLatch();
5570 if (Header == Latch) {
5571 // Estimate the size of the loop.
5572 unsigned Size;
5573 unsigned Width = 10;
5574 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5575 return;
5576
5577 // Try to find an unroll count that maximizes the use of the instruction
5578 // window, i.e. trying to fetch as many instructions per cycle as possible.
// Candidates are UC = 1..8 with an unrolled size of at most 48. A candidate
// replaces the current best when its size is a multiple of MaxInstsPerLine or
// leaves a larger remainder modulo MaxInstsPerLine than the current best.
5579 unsigned MaxInstsPerLine = 16;
5580 unsigned UC = 1;
5581 unsigned BestUC = 1;
5582 unsigned SizeWithBestUC = BestUC * Size;
5583 while (UC <= 8) {
5584 unsigned SizeWithUC = UC * Size;
5585 if (SizeWithUC > 48)
5586 break;
5587 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5588 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5589 BestUC = UC;
5590 SizeWithBestUC = BestUC * Size;
5591 }
5592 UC++;
5593 }
5594
5595 if (BestUC == 1)
5596 return;
5597
// Collect loads with a loop-varying address (plus their in-loop users) and
// the stores with a loop-varying address.
5598 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5600 for (auto *BB : L->blocks()) {
5601 for (auto &I : *BB) {
5603 if (!Ptr)
5604 continue;
5605 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5606 if (SE.isLoopInvariant(PtrSCEV, L))
5607 continue;
5608 if (isa<LoadInst>(&I)) {
5609 LoadedValuesPlus.insert(&I);
5610 // Include in-loop 1st users of loaded values.
5611 for (auto *U : I.users())
5612 if (L->contains(cast<Instruction>(U)))
5613 LoadedValuesPlus.insert(U);
5614 } else
5615 Stores.push_back(cast<StoreInst>(&I));
5616 }
5617 }
5618
// Only unroll if some store writes a loaded value or a direct user of one.
5619 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5620 return LoadedValuesPlus.contains(SI->getOperand(0));
5621 }))
5622 return;
5623
5624 UP.Runtime = true;
5625 UP.DefaultUnrollRuntimeCount = BestUC;
5626 return;
5627 }
5628
5629 // Try to runtime-unroll loops with early-continues depending on loop-varying
5630 // loads; this helps with branch-prediction for the early-continues.
5631 auto *Term = dyn_cast<CondBrInst>(Header->getTerminator());
5633 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5634 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5635 return;
5636
// Recursively walks operands (giving up beyond depth 8) and reports whether I
// is, or depends on, a load. PHI nodes and loop-invariant values stop the
// search.
5637 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5638 [&](Instruction *I, unsigned Depth) -> bool {
5639 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5640 return false;
5641
5642 if (isa<LoadInst>(I))
5643 return true;
5644
5645 return any_of(I->operands(), [&](Value *V) {
5646 auto *I = dyn_cast<Instruction>(V);
5647 return I && DependsOnLoopLoad(I, Depth + 1);
5648 });
5649 };
// Enable runtime unrolling when the header's branch is on an integer compare
// whose first operand depends on such a load.
5650 CmpPredicate Pred;
5651 Instruction *I;
5652 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5653 m_Value())) &&
5654 DependsOnLoopLoad(I, 0)) {
5655 UP.Runtime = true;
5656 }
5657}
5658
// AArch64 override of the unrolling-preferences hook: starts from the generic
// preferences and then adjusts UP.
// NOTE(review): this extract is missing listing lines 5659-5660 (start of the
// signature), 5674 (statement for the -Os comment), 5680 (declaration of
// `Cost`), 5689-5690 (definition of `F`), 5698 (end of the getInstructionCost
// call), 5706-5707 (rest of the Falkor branch), 5715, 5731 and 5734 (single
// statements) and 5739 (the condition guarding `UP.Force = true;`).
5661 OptimizationRemarkEmitter *ORE) const {
5662 // Enable partial unrolling and runtime unrolling.
5663 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5664
5665 UP.UpperBound = true;
5666
5667 // For inner loop, it is more likely to be a hot one, and the runtime check
5668 // can be promoted out from LICM pass, so the overhead is less, let's try
5669 // a larger threshold to unroll more loops.
5670 if (L->getLoopDepth() > 1)
5671 UP.PartialThreshold *= 2;
5672
5673 // Disable partial & runtime unrolling on -Os.
5675
5676 // Scan the loop: don't unroll loops with calls as this could prevent
5677 // inlining. Don't unroll auto-vectorized loops either, though do allow
5678 // unrolling of the scalar remainder.
5679 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5681 for (auto *BB : L->getBlocks()) {
5682 for (auto &I : *BB) {
5683 // Both auto-vectorized loops and the scalar remainder have the
5684 // isvectorized attribute, so differentiate between them by the presence
5685 // of vector instructions.
5686 if (IsVectorized && I.getType()->isVectorTy())
5687 return;
// Calls that are not lowered to a real call are skipped; any other call makes
// the function return with the preferences set so far.
5688 if (isa<CallBase>(I)) {
5691 if (!isLoweredToCall(F))
5692 continue;
5693 return;
5694 }
5695
5696 SmallVector<const Value *, 4> Operands(I.operand_values());
5697 Cost += getInstructionCost(&I, Operands,
5699 }
5700 }
5701
5702 // Apply subtarget-specific unrolling preferences.
5703 if (ST->isAppleMLike())
5704 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5705 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5708
5709 // If this is a small, multi-exit loop similar to something like std::find,
5710 // then there is typically a performance improvement achieved by unrolling.
5711 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5712 UP.RuntimeUnrollMultiExit = true;
5713 UP.Runtime = true;
5714 // Limit unroll count.
5716 // Allow slightly more costly trip-count expansion to catch search loops
5717 // with pointer inductions.
5718 UP.SCEVExpansionBudget = 5;
5719 return;
5720 }
5721
5722 // Enable runtime unrolling for in-order models
5723 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5724 // checking for that case, we can ensure that the default behaviour is
5725 // unchanged
// NOTE(review): the comment above names AArch64Subtarget::Others, but the
// condition below tests AArch64Subtarget::Generic -- confirm which is current.
5726 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5727 !ST->getSchedModel().isOutOfOrder()) {
5728 UP.Runtime = true;
5729 UP.Partial = true;
5730 UP.UnrollRemainder = true;
5732
5733 UP.UnrollAndJam = true;
5735 }
5736
5737 // Forcing the unrolling of small loops can be very useful because of the
5738 // branch taken cost of the backedge.
5740 UP.Force = true;
5741}
5742
5747
// NOTE(review): listing line 5748 (function name and first parameter) is
// missing from this extract; the name is an assumption -- confirm.
// For the NEON store intrinsics handled below, builds a struct value of
// ExpectedType from the stored operands; for the NEON load intrinsics, returns
// Inst itself when its type already equals ExpectedType. Returns nullptr in
// every other case.
5749 Type *ExpectedType,
5750 bool CanCreate) const {
5751 switch (Inst->getIntrinsicID()) {
5752 default:
5753 return nullptr;
5754 case Intrinsic::aarch64_neon_st1x2:
5755 case Intrinsic::aarch64_neon_st1x3:
5756 case Intrinsic::aarch64_neon_st1x4:
5757 case Intrinsic::aarch64_neon_st2:
5758 case Intrinsic::aarch64_neon_st3:
5759 case Intrinsic::aarch64_neon_st4: {
5760 // Create a struct type
// This `ST` is a local StructType and shadows any outer `ST` in this case.
5761 StructType *ST = dyn_cast<StructType>(ExpectedType);
5762 if (!CanCreate || !ST)
5763 return nullptr;
// All arguments except the last are treated as the stored values; they must
// match the struct's element count and element types exactly.
5764 unsigned NumElts = Inst->arg_size() - 1;
5765 if (ST->getNumElements() != NumElts)
5766 return nullptr;
5767 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5768 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5769 return nullptr;
5770 }
// Insert each stored value into a poison struct; new instructions are placed
// before Inst.
5771 Value *Res = PoisonValue::get(ExpectedType);
5772 IRBuilder<> Builder(Inst);
5773 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5774 Value *L = Inst->getArgOperand(i);
5775 Res = Builder.CreateInsertValue(Res, L, i);
5776 }
5777 return Res;
5778 }
5779 case Intrinsic::aarch64_neon_ld1x2:
5780 case Intrinsic::aarch64_neon_ld1x3:
5781 case Intrinsic::aarch64_neon_ld1x4:
5782 case Intrinsic::aarch64_neon_ld2:
5783 case Intrinsic::aarch64_neon_ld3:
5784 case Intrinsic::aarch64_neon_ld4:
5785 if (Inst->getType() == ExpectedType)
5786 return Inst;
5787 return nullptr;
5788 }
5789}
5790
// NOTE(review): listing line 5791 (function name and first parameter) is
// missing from this extract; the name is an assumption -- confirm.
// Fills Info for the NEON ld1xN/ldN/stN/st1xN intrinsics and returns true;
// returns false for any other intrinsic.
5792 MemIntrinsicInfo &Info) const {
// First switch: read/write flags and the pointer operand. Loads take the
// pointer as argument 0, stores as their last argument.
5793 switch (Inst->getIntrinsicID()) {
5794 default:
5795 break;
5796 case Intrinsic::aarch64_neon_ld1x2:
5797 case Intrinsic::aarch64_neon_ld1x3:
5798 case Intrinsic::aarch64_neon_ld1x4:
5799 case Intrinsic::aarch64_neon_ld2:
5800 case Intrinsic::aarch64_neon_ld3:
5801 case Intrinsic::aarch64_neon_ld4:
5802 Info.ReadMem = true;
5803 Info.WriteMem = false;
5804 Info.PtrVal = Inst->getArgOperand(0);
5805 break;
5806 case Intrinsic::aarch64_neon_st1x2:
5807 case Intrinsic::aarch64_neon_st1x3:
5808 case Intrinsic::aarch64_neon_st1x4:
5809 case Intrinsic::aarch64_neon_st2:
5810 case Intrinsic::aarch64_neon_st3:
5811 case Intrinsic::aarch64_neon_st4:
5812 Info.ReadMem = false;
5813 Info.WriteMem = true;
5814 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5815 break;
5816 }
5817
5818 // Use the ID of neon load as the "matching id".
// Each store intrinsic shares the MatchingId of its corresponding load.
5819 switch (Inst->getIntrinsicID()) {
5820 default:
5821 return false;
5822 case Intrinsic::aarch64_neon_ld1x2:
5823 case Intrinsic::aarch64_neon_st1x2:
5824 Info.MatchingId = Intrinsic::aarch64_neon_ld1x2;
5825 break;
5826 case Intrinsic::aarch64_neon_ld1x3:
5827 case Intrinsic::aarch64_neon_st1x3:
5828 Info.MatchingId = Intrinsic::aarch64_neon_ld1x3;
5829 break;
5830 case Intrinsic::aarch64_neon_ld1x4:
5831 case Intrinsic::aarch64_neon_st1x4:
5832 Info.MatchingId = Intrinsic::aarch64_neon_ld1x4;
5833 break;
5834 case Intrinsic::aarch64_neon_ld2:
5835 case Intrinsic::aarch64_neon_st2:
5836 Info.MatchingId = Intrinsic::aarch64_neon_ld2;
5837 break;
5838 case Intrinsic::aarch64_neon_ld3:
5839 case Intrinsic::aarch64_neon_st3:
5840 Info.MatchingId = Intrinsic::aarch64_neon_ld3;
5841 break;
5842 case Intrinsic::aarch64_neon_ld4:
5843 case Intrinsic::aarch64_neon_st4:
5844 Info.MatchingId = Intrinsic::aarch64_neon_ld4;
5845 break;
5846 }
5847 return true;
5848}
5849
5850/// See if \p I should be considered for address type promotion. We check if \p
5851/// I is a sext with right type and used in memory accesses. If it used in a
5852/// "complex" getelementptr, we allow it to be promoted without finding other
5853/// sext instructions that sign extended the same initial value. A getelementptr
5854/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): listing line 5855 (start of the signature) is missing from
// this extract; the function name is an assumption -- confirm.
// AllowPromotionWithoutCommonHeader is always written: it is reset to false on
// entry and set to true only when a GEP user with more than 2 operands is
// found.
5856 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5857 bool Considerable = false;
5858 AllowPromotionWithoutCommonHeader = false;
5859 if (!isa<SExtInst>(&I))
5860 return false;
// Only sign extensions producing i64 are considered.
5861 Type *ConsideredSExtType =
5862 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5863 if (I.getType() != ConsideredSExtType)
5864 return false;
5865 // See if the sext is the one with the right type and used in at least one
5866 // GetElementPtrInst.
5867 for (const User *U : I.users()) {
5868 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5869 Considerable = true;
5870 // A getelementptr is considered as "complex" if it has more than 2
5871 // operands. We will promote a SExt used in such complex GEP as we
5872 // expect some computation to be merged if they are done on 64 bits.
5873 if (GEPInst->getNumOperands() > 2) {
5874 AllowPromotionWithoutCommonHeader = true;
5875 break;
5876 }
5877 }
5878 }
5879 return Considerable;
5880}
5881
// NOTE(review): listing line 5882 (start of the signature) is missing from
// this extract, as are lines 5894-5895 and 5909 inside the switch (presumably
// further case labels -- confirm). The function name is an assumption.
// Fixed-width VFs are always accepted. For scalable VFs the recurrence type
// must not be bfloat, must be a legal scalable-vector element type, and the
// recurrence kind must be one of those listed below.
5883 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5884 if (!VF.isScalable())
5885 return true;
5886
5887 Type *Ty = RdxDesc.getRecurrenceType();
5888 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5889 return false;
5890
5891 switch (RdxDesc.getRecurrenceKind()) {
5892 case RecurKind::Sub:
5893 case RecurKind::FSub:
5896 case RecurKind::Add:
5897 case RecurKind::FAdd:
5898 case RecurKind::And:
5899 case RecurKind::Or:
5900 case RecurKind::Xor:
5901 case RecurKind::SMin:
5902 case RecurKind::SMax:
5903 case RecurKind::UMin:
5904 case RecurKind::UMax:
5905 case RecurKind::FMin:
5906 case RecurKind::FMax:
5907 case RecurKind::FMulAdd:
5908 case RecurKind::AnyOf:
5910 return true;
5911 default:
5912 return false;
5913 }
5914}
5915
// Cost of a min/max reduction of vector type Ty using intrinsic IID.
// NOTE(review): this extract is missing listing lines 5916-5917 and 5919
// (parts of the signature) and 5926 (the statement guarded by the
// <vscale x 1 x eltty> check).
5918 FastMathFlags FMF,
5920 // The code-generator is currently not able to handle scalable vectors
5921 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5922 // it. This change will be removed when code-generation for these types is
5923 // sufficiently reliable.
5924 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5925 if (VTy->getElementCount() == ElementCount::getScalable(1))
5927
5928 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5929
// f16 without full FP16 support is costed by the generic implementation.
5930 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5931 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5932
// When the type is split into LT.first legal parts, charge LT.first - 1
// intrinsic operations on the legal vector type.
5933 InstructionCost LegalizationCost = 0;
5934 if (LT.first > 1) {
5935 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5936 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5937 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5938 }
5939
5940 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5941}
5942
// Cost of an arithmetic reduction on a scalable vector type.
// NOTE(review): this extract is missing listing line 5943 (start of the
// signature) and line 5964 (the statement of the `default:` case).
5944 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5945 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
// When the type is split into LT.first legal parts, charge LT.first - 1
// arithmetic operations on the legal vector type.
5946 InstructionCost LegalizationCost = 0;
5947 if (LT.first > 1) {
5948 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5949 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5950 LegalizationCost *= LT.first - 1;
5951 }
5952
5953 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5954 assert(ISD && "Invalid opcode");
5955 // Add the final reduction cost for the legal horizontal reduction
5956 switch (ISD) {
5957 case ISD::ADD:
5958 case ISD::AND:
5959 case ISD::OR:
5960 case ISD::XOR:
5961 case ISD::FADD:
5962 return LegalizationCost + 2;
5963 default:
5965 }
5966}
5967
// Cost of an arithmetic reduction with opcode Opcode over vector type ValTy.
// NOTE(review): this extract is missing listing lines 5968-5969 and 5971
// (parts of the signature), 5978 (statement guarded by the
// <vscale x 1 x eltty> check), 5980 (the condition opening the block that
// closes at listing line 5997), 5990 (statement guarded by the FAdd/bfloat
// check) and 5993 (declaration of `Cost` in that block).
5970 std::optional<FastMathFlags> FMF,
5972 // The code-generator is currently not able to handle scalable vectors
5973 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5974 // it. This change will be removed when code-generation for these types is
5975 // sufficiently reliable.
5976 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5977 if (VTy->getElementCount() == ElementCount::getScalable(1))
5979
// Inside the block opened on the missing line 5980: fixed-width vectors get
// the generic cost plus one per element; scalable vectors get the scalar
// operation cost times getMaxNumElements of the element count.
5981 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5982 InstructionCost BaseCost =
5983 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5984 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5985 // end up vectorizing for more computationally intensive loops.
5986 return BaseCost + FixedVTy->getNumElements();
5987 }
5988
5989 if (Opcode != Instruction::FAdd || ValTy->getElementType()->isBFloatTy())
5991
5992 auto *VTy = cast<ScalableVectorType>(ValTy);
5994 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5995 Cost *= getMaxNumElements(VTy->getElementCount());
5996 return Cost;
5997 }
5998
// Remaining scalable-vector reductions are costed by the SVE helper.
5999 if (isa<ScalableVectorType>(ValTy))
6000 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
6001
6002 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
6003 MVT MTy = LT.second;
6004 int ISD = TLI->InstructionOpcodeToISD(Opcode);
6005 assert(ISD && "Invalid opcode");
6006
6007 // Horizontal adds can use the 'addv' instruction. We model the cost of these
6008 // instructions as twice a normal vector add, plus 1 for each legalization
6009 // step (LT.first). This is the only arithmetic vector reduction operation for
6010 // which we have an instruction.
6011 // OR, XOR and AND costs should match the codegen from:
6012 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
6013 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
6014 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
6015 static const CostTblEntry CostTblNoPairwise[]{
6016 {ISD::ADD, MVT::v8i8, 2},
6017 {ISD::ADD, MVT::v16i8, 2},
6018 {ISD::ADD, MVT::v4i16, 2},
6019 {ISD::ADD, MVT::v8i16, 2},
6020 {ISD::ADD, MVT::v2i32, 2},
6021 {ISD::ADD, MVT::v4i32, 2},
6022 {ISD::ADD, MVT::v2i64, 2},
6023 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
6024 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
6025 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
6026 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
6027 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
6028 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
6029 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
6030 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
6031 {ISD::XOR, MVT::v16i8, 7},
6032 {ISD::XOR, MVT::v4i16, 4},
6033 {ISD::XOR, MVT::v8i16, 6},
6034 {ISD::XOR, MVT::v2i32, 3},
6035 {ISD::XOR, MVT::v4i32, 5},
6036 {ISD::XOR, MVT::v2i64, 3},
6037 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
6038 {ISD::AND, MVT::v16i8, 7},
6039 {ISD::AND, MVT::v4i16, 4},
6040 {ISD::AND, MVT::v8i16, 6},
6041 {ISD::AND, MVT::v2i32, 3},
6042 {ISD::AND, MVT::v4i32, 5},
6043 {ISD::AND, MVT::v2i64, 3},
6044 };
// Any case that breaks out of the switch falls through to the generic cost at
// the end of the function.
6045 switch (ISD) {
6046 default:
6047 break;
6048 case ISD::FADD:
6049 if (Type *EltTy = ValTy->getScalarType();
6050 // FIXME: For half types without fullfp16 support, this could extend and
6051 // use a fp32 faddp reduction but current codegen unrolls.
6052 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
6053 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
6054 const unsigned NElts = MTy.getVectorNumElements();
6055 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
6056 isPowerOf2_32(NElts))
6057 // Reduction corresponding to series of fadd instructions is lowered to
6058 // series of faddp instructions. faddp has latency/throughput that
6059 // matches fadd instruction and hence, every faddp instruction can be
6060 // considered to have a relative cost = 1 with
6061 // CostKind = TCK_RecipThroughput.
6062 // An faddp will pairwise add vector elements, so the size of input
6063 // vector reduces by half every time, requiring
6064 // #(faddp instructions) = log2_32(NElts).
6065 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
6066 }
6067 break;
6068 case ISD::ADD:
6069 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
6070 return (LT.first - 1) + Entry->Cost;
6071 break;
6072 case ISD::XOR:
6073 case ISD::AND:
6074 case ISD::OR:
6075 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
6076 if (!Entry)
6077 break;
// The table cost is only used when the source vector has a power-of-2 element
// count that is at least the legal type's element count.
6078 auto *ValVTy = cast<FixedVectorType>(ValTy);
6079 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
6080 isPowerOf2_32(ValVTy->getNumElements())) {
6081 InstructionCost ExtraCost = 0;
6082 if (LT.first != 1) {
6083 // Type needs to be split, so there is an extra cost of LT.first - 1
6084 // arithmetic ops.
6085 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
6086 MTy.getVectorNumElements());
6087 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
6088 ExtraCost *= LT.first - 1;
6089 }
6090 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
6091 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
6092 return Cost + ExtraCost;
6093 }
6094 break;
6095 }
6096 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
6097}
6098
// Cost of an extending reduction (the BaseT fallback at the end names this
// hook getExtendedReductionCost).
// NOTE(review): the line carrying the return type and function name is absent
// from this listing (the embedded numbering jumps from 6098 to 6100).
//
// Only add reductions whose vector and result value types are both simple and
// whose vector type is at least 64 bits wide get an AArch64-specific cost;
// every other request is forwarded unchanged to the generic implementation.
6100 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
6101 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
6102 EVT VecVT = TLI->getValueType(DL, VecTy);
6103 EVT ResVT = TLI->getValueType(DL, ResTy);
6104
6105 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
6106 VecVT.getSizeInBits() >= 64) {
6107 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
6108
6109 // The legal cases are:
6110 // UADDLV 8/16/32->32
6111 // UADDLP 32->64
// i8 and i16 element vectors qualify for results of up to 32 bits, i32
// element vectors for results of up to 64 bits.
6112 unsigned RevVTSize = ResVT.getSizeInBits();
6113 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6114 RevVTSize <= 32) ||
6115 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
6116 RevVTSize <= 32) ||
6117 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
6118 RevVTSize <= 64))
// Base cost of 2, plus 2 for every legalization step beyond the first.
6119 return (LT.first - 1) * 2 + 2;
6120 }
6121
// No AArch64-specific pattern matched: use the target-independent cost.
6122 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
6123 CostKind);
6124}
6125
// Cost of a multiply-accumulate reduction.
// NOTE(review): listing lines 6126 and 6129 are missing, so the return type
// and the end of the parameter list (which must declare CostKind, used below)
// are not visible here.
//
// When the subtarget has the dot-product feature, an add reduction whose
// legalized input type is v8i8 or v16i8 and whose result is i32 is costed as
// LT.first + 2. Everything else defers to the generic implementation.
6127AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
6128 Type *ResTy, VectorType *VecTy,
6130 EVT VecVT = TLI->getValueType(DL, VecTy);
6131 EVT ResVT = TLI->getValueType(DL, ResTy);
6132
6133 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
6134 RedOpcode == Instruction::Add) {
6135 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
6136
6137 // The legal cases with dotprod are
6138 // UDOT 8->32
6139 // Which requires an additional uaddv to sum the i32 values.
6140 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6141 ResVT == MVT::i32)
6142 return LT.first + 2;
6143 }
6144
// No dot-product pattern matched: use the target-independent cost.
6145 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
6146 CostKind);
6147}
6148
// Cost of a splice shuffle on a scalable vector type, looked up in ShuffleTbl
// by the legalized (and, for predicates, promoted) type. getShuffleCost calls
// this as getSpliceCost(SrcTy, Index, CostKind).
// NOTE(review): the function header (listing lines 6149-6151) and several body
// lines (6172-6173, 6185, 6187, 6195, 6197) are missing from this listing, so
// some expressions below appear truncated.
6152 static const CostTblEntry ShuffleTbl[] = {
6153 { TTI::SK_Splice, MVT::nxv16i8, 1 },
6154 { TTI::SK_Splice, MVT::nxv8i16, 1 },
6155 { TTI::SK_Splice, MVT::nxv4i32, 1 },
6156 { TTI::SK_Splice, MVT::nxv2i64, 1 },
6157 { TTI::SK_Splice, MVT::nxv2f16, 1 },
6158 { TTI::SK_Splice, MVT::nxv4f16, 1 },
6159 { TTI::SK_Splice, MVT::nxv8f16, 1 },
6160 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
6161 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
6162 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
6163 { TTI::SK_Splice, MVT::nxv2f32, 1 },
6164 { TTI::SK_Splice, MVT::nxv4f32, 1 },
6165 { TTI::SK_Splice, MVT::nxv2f64, 1 },
6166 };
6167
6168 // The code-generator is currently not able to handle scalable vectors
6169 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
6170 // it. This change will be removed when code-generation for these types is
6171 // sufficiently reliable.
// NOTE(review): the check and early return this comment describes sit on the
// missing listing lines 6172-6173.
6174
6175 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
6176 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
// i1 (predicate) vectors are costed on the type the target promotes them to;
// all other types are costed on the legalized type itself.
6177 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
6178 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
6179 : LT.second;
6180 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
6181 InstructionCost LegalizationCost = 0;
// A negative Index additionally charges a compare and a select (presumably
// summed; the lines joining the two calls are missing from the listing).
6182 if (Index < 0) {
6183 LegalizationCost =
6184 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
6186 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
6188 }
6189
6190 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
6191 // Cost performed on a promoted type.
// The zext into the promoted type and the trunc back out of it are charged on
// top of the splice itself.
6192 if (LT.second.getScalarType() == MVT::i1) {
6193 LegalizationCost +=
6194 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
6196 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
6198 }
6199 const auto *Entry =
6200 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
6201 assert(Entry && "Illegal Type for Splice");
6202 LegalizationCost += Entry->Cost;
// The per-part cost is scaled by LT.first from the type legalization result.
6203 return LegalizationCost * LT.first;
6204}
6205
// Cost of a partial reduction (the BaseT fallback at the end names this hook
// getPartialReductionCost).
// NOTE(review): several lines of this definition are missing from the listing
// (6206, 6208, 6211, 6213, 6263-6265 and 6268). They presumably hold the
// function name, the VF/OpAExtend parameters, the declaration of `Invalid`,
// the condition guarding the first `return Invalid`, and the case labels of
// the type-conversion switch - confirm against the real source.
//
// What the visible code does, in order:
//  * rejects anything that is not an Add/Sub/FAdd/FSub reduction of an
//    extended operand A;
//  * requires `reassoc` and `contract` for floating-point accumulators;
//  * accepts an optional binary op only if it is Mul/FMul on identical input
//    types;
//  * matches the legalized accumulator/input element types against the
//    dot-product and widening multiply-accumulate forms listed below;
//  * falls back to BaseT when no form matches.
6207 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
6209 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
6210 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
6212
6214 return Invalid;
6215
6216 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
6217 Opcode != Instruction::FAdd && Opcode != Instruction::FSub) ||
6218 OpAExtend == TTI::PR_None)
6219 return Invalid;
6220
6221 // Floating-point partial reductions are invalid if `reassoc` and `contract`
6222 // are not allowed.
6223 if (AccumType->isFloatingPointTy()) {
6224 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
6225 if (!FMF->allowReassoc() || !FMF->allowContract())
6226 return Invalid;
6227 } else {
6228 assert(!FMF &&
6229 "FastMathFlags only apply to floating-point partial reductions");
6230 }
6231
// BinOp, OpBExtend and InputTypeB must be either all present or all absent.
6232 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
6233 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
6234 "Unexpected values for OpBExtend or InputTypeB");
6235
6236 // We only support multiply binary operations for now, and for muls we
6237 // require the types being extended to be the same.
6238 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6239 InputTypeA != InputTypeB))
6240 return Invalid;
6241
// A mixed-extend ("usdot") reduction is one where operand B is extended, but
// with a different extend kind than operand A.
6242 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6243 // USDot is natively supported with +i8mm. With plain +dotprod, SUMLA is
6244 // lowered to two udots plus an eor and a sub.
6245 if (IsUSDot && !ST->hasMatMulInt8() && !ST->hasDotProd())
6246 // FIXME: Remove this early bailout in favour of expand cost.
6247 return Invalid;
6248
// Ratio is how many times wider an accumulator element is than an input
// element; the accumulator vector below has VF / Ratio elements.
6249 unsigned Ratio =
6250 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6251 if (VF.getKnownMinValue() <= Ratio)
6252 return Invalid;
6253
6254 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6255 VectorType *AccumVectorType =
6256 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6257 // We don't yet support all kinds of legalization.
6258 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6259 EVT::getEVT(AccumVectorType));
6260 switch (TC.first) {
6261 default:
6262 return Invalid;
// NOTE(review): the accepted case labels (listing lines 6263-6265) and the
// right-hand side of the comparison below (line 6268) are missing.
6266 // The legalised type (e.g. after splitting) must be legal too.
6267 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6269 return Invalid;
6270 break;
6271 }
6272
6273 std::pair<InstructionCost, MVT> AccumLT =
6274 getTypeLegalizationCost(AccumVectorType);
6275 std::pair<InstructionCost, MVT> InputLT =
6276 getTypeLegalizationCost(InputVectorType);
6277
6278 // Returns true if the subtarget supports the operation for a given type.
// SVEPred applies whenever SVE or streaming SVE is available; NEONPred only
// applies to fixed-length accumulator types of at most 128 bits when NEON is
// available.
6279 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6280 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6281 (AccumLT.second.isFixedLengthVector() &&
6282 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6283 NEONPred);
6284 };
6285
6286 bool IsSub = Opcode == Instruction::Sub || Opcode == Instruction::FSub;
// Baseline: one basic-cost operation per legalized input part.
6287 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6288 // Integer partial sub-reductions that don't map to a specific instruction,
6289 // carry an extra cost for implementing a double negation:
6290 // partial_reduce_umls acc, lhs, rhs
6291 // <=> -partial_reduce_umla -acc, lhs, rhs
6292 InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0;
6293
6294 if (AccumLT.second.getScalarType() == MVT::i32 &&
6295 InputLT.second.getScalarType() == MVT::i8) {
6296 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6297 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6298 return Cost + INegCost;
6299 // i8 -> i32 usdot requires +i8mm
6300 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6301 return Cost + INegCost;
6302 // Without +i8mm, lower SUMLA via two udots plus an eor and a sub on plain
6303 // +dotprod targets. Note that this is only implemented for NEON, as all
6304 // modern CPUs with SVE also have +i8mm. Charge an extra factor for the
6305 // expansion.
6306 if (IsUSDot && IsSupported(false, ST->hasDotProd()))
6307 return Cost * 3 + INegCost;
6308 }
6309
6310 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6311 // i16 -> i64 is natively supported for udot/sdot
6312 if (AccumLT.second.getScalarType() == MVT::i64 &&
6313 InputLT.second.getScalarType() == MVT::i16)
6314 return Cost + INegCost;
6315 // i16 -> i32 is natively supported with SVE2p1 udot/sdot.
6316 // For sub-reductions, we prefer using the *mlslb/t instructions.
6317 if (AccumLT.second.getScalarType() == MVT::i32 &&
6318 InputLT.second.getScalarType() == MVT::i16 &&
6319 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6320 return Cost;
6321 // i8 -> i64 is supported with an extra level of extends
6322 if (AccumLT.second.getScalarType() == MVT::i64 &&
6323 InputLT.second.getScalarType() == MVT::i8)
6324 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6325 // because it requires two extra extends on the inputs. But if we'd change
6326 // that now, a regular reduction would be cheaper because the costs of
6327 // the extends in the IR are still counted. This can be fixed
6328 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6329 return Cost + INegCost;
6330 // i8 -> i16 is natively supported with SVE2p3 udot/sdot
6331 // For sub-reductions, we prefer using the *mlslb/t instructions.
6332 if (AccumLT.second.getScalarType() == MVT::i16 &&
6333 InputLT.second.getScalarType() == MVT::i8 &&
6334 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6335 return Cost;
6336 }
6337
6338 // f16 -> f32 is natively supported for fdot using either
6339 // SVE or NEON instruction.
6340 if (Opcode == Instruction::FAdd && !IsSub &&
6341 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6342 AccumLT.second.getScalarType() == MVT::f32 &&
6343 InputLT.second.getScalarType() == MVT::f16)
6344 return Cost;
6345
6346 // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions.
// Each of these forms is charged twice the baseline cost.
6347 if (Ratio == 2 && !IsUSDot) {
6348 MVT InVT = InputLT.second.getScalarType();
6349
6350 // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2)
6351 if (IsSupported(ST->hasSVE2() || ST->hasSME(), true) &&
6352 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6353 return Cost * 2;
6354
6355 // SVE2 fml[as]lb/t and NEON fml[as]l(2)
6356 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6357 return Cost * 2;
6358
6359 // SME2/SVE2p1 bfmlslb/t
6360 if (IsSupported(ST->hasSVE2p1() || ST->hasSME2(), false) &&
6361 InVT == MVT::bf16 && IsSub)
6362 return Cost * 2;
6363
6364 // FP partial sub-reductions that don't map to a specific instruction,
6365 // carry an extra cost for implementing an extra negation:
6366 // partial_reduce_fmls acc, lhs, rhs
6367 // <=> partial_reduce_fmla acc, lhs, -rhs
6368 InstructionCost FNegCost = IsSub ? InputLT.first * TTI::TCC_Basic : 0;
6369
6370 // SVE and NEON bfmlalb/t
6371 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6372 return Cost * 2 + FNegCost;
6373 }
6374
// Nothing AArch64-specific matched: use the target-independent cost.
6375 return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
6376 AccumType, VF, OpAExtend, OpBExtend,
6377 BinOp, CostKind, FMF);
6378}
6379
// AArch64 cost of a vector shuffle (the BaseT fallback at the end names this
// hook getShuffleCost; the mask-splitting path below also recurses into it).
// NOTE(review): many lines of this definition are missing from the listing
// (6380-6381, 6384, 6404-6405, 6413, 6415, 6424, 6435, 6437, 6478, 6505, 6527,
// 6529, 6531, 6741), including the function name, the Kind/DstTy/SubTp/Args
// parameters and the declaration of the local `Cost`. Truncated expressions
// below are a listing artifact.
//
// The visible code tries the following, in order, and returns on the first
// match:
//  1. Wide fixed-length masks that need splitting: LD3/LD4 and ST3/ST4 style
//     shuffles get a special cost, otherwise the mask is cut into legal-sized
//     chunks that are costed individually and summed.
//  2. Extracting the low or high half of a fixed-length vector of at least
//     128 bits (cost 0 or 1).
//  3. Identity masks on fixed-length vectors (free).
//  4. Segmented single-source shuffles on SVE (dupq / mov zd.q).
//  5. Broadcast of a load when costing for code size (free).
//  6. 4-element masks with 16- or 32-bit elements, via the perfect-shuffle
//     tables.
//  7. ZIP/TRN/UZP/REV/splat masks on a legal fixed-length type (cost 1).
//  8. A table lookup keyed on shuffle kind and legalized type.
//  9. Scalable splices, via getSpliceCost.
// 10. Aligned subvector inserts on fixed-length vectors of at most 128 bits.
// 11. The target-independent implementation.
6382 VectorType *SrcTy, ArrayRef<int> Mask,
6383 TTI::TargetCostKind CostKind, int Index,
6385 const Instruction *CxtI) const {
6386 assert((Mask.empty() || DstTy->isScalableTy() ||
6387 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6388 "Expected the Mask to match the return size if given");
6389 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6390 "Expected the same scalar types");
6391 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6392
6393 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6394 // into smaller vectors and sum the cost of each shuffle.
6395 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6396 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6397 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6398 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6399 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6400 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6401 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6402 // cost than just the load.
// NOTE(review): the value returned below is max(1, LT.first / 4), not LT.first
// as the comment above says; the rest of the condition is on missing lines.
6403 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6406 return std::max<InstructionCost>(1, LT.first / 4);
6407
6408 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6409 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6410 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6411 // cost than just the store.
6412 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6414 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6416 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6417 return LT.first;
6418
// General case: NumVecs is the mask length divided by the legal element
// count, rounded up; NTp is the legal-sized vector type used for each chunk.
6419 unsigned TpNumElts = Mask.size();
6420 unsigned LTNumElts = LT.second.getVectorNumElements();
6421 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6422 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6423 LT.second.getVectorElementCount());
// Cache keyed on (Source1, Source2, sub-mask) so that identical sub-shuffles
// are only charged once.
6425 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6426 PreviousCosts;
6427 for (unsigned N = 0; N < NumVecs; N++) {
6428 SmallVector<int> NMask;
6429 // Split the existing mask into chunks of size LTNumElts. Track the source
6430 // sub-vectors to ensure the result has at most 2 inputs.
6431 unsigned Source1 = -1U, Source2 = -1U;
6432 unsigned NumSources = 0;
6433 for (unsigned E = 0; E < LTNumElts; E++) {
6434 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6436 if (MaskElt < 0) {
6438 continue;
6439 }
6440
6441 // Calculate which source from the input this comes from and whether it
6442 // is new to us.
6443 unsigned Source = MaskElt / LTNumElts;
6444 if (NumSources == 0) {
6445 Source1 = Source;
6446 NumSources = 1;
6447 } else if (NumSources == 1 && Source != Source1) {
6448 Source2 = Source;
6449 NumSources = 2;
6450 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6451 NumSources++;
6452 }
6453
6454 // Add to the new mask. For the NumSources>2 case these are not correct,
6455 // but are only used for the modular lane number.
6456 if (Source == Source1)
6457 NMask.push_back(MaskElt % LTNumElts);
6458 else if (Source == Source2)
6459 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6460 else
6461 NMask.push_back(MaskElt % LTNumElts);
6462 }
6463 // Check if we have already generated this sub-shuffle, which means we
6464 // will have already generated the output. For example a <16 x i32> splat
6465 // will be the same sub-splat 4 times, which only needs to be generated
6466 // once and reused.
6467 auto Result =
6468 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6469 // Check if it was already in the map (already costed).
6470 if (!Result.second)
6471 continue;
6472 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6473 // getShuffleCost. If not then cost it using the worst case as the number
6474 // of element moves into a new vector.
6475 InstructionCost NCost =
6476 NumSources <= 2
6477 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6479 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6480 CxtI)
6481 : LTNumElts;
6482 Result.first->second = NCost;
6483 Cost += NCost;
6484 }
6485 return Cost;
6486 }
6487
// Let the mask refine the shuffle kind before the pattern checks below.
6488 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6489 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6490 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6491 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6492 // This currently only handles low or high extracts to prevent SLP vectorizer
6493 // regressions.
6494 // Note that SVE's ext instruction is destructive, but it can be fused with
6495 // a movprfx to act like a constructive instruction.
6496 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6497 if (LT.second.getFixedSizeInBits() >= 128 &&
6498 cast<FixedVectorType>(SubTp)->getNumElements() ==
6499 LT.second.getVectorNumElements() / 2) {
6500 if (Index == 0)
6501 return 0;
6502 if (Index == (int)LT.second.getVectorNumElements() / 2)
6503 return 1;
6504 }
// NOTE(review): listing line 6505 (a statement between the two closing braces)
// is missing; see also the "Restore optimal kind" step near the end.
6506 }
6507 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6508 // the code to handle length-changing shuffles.
6509 if (Kind == TTI::SK_InsertSubvector) {
6510 LT = getTypeLegalizationCost(DstTy);
6511 SrcTy = DstTy;
6512 }
6513
6514 // Check for identity masks, which we can treat as free for both fixed and
6515 // scalable vector paths.
6516 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6517 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6518 all_of(enumerate(Mask), [](const auto &M) {
6519 return M.value() < 0 || M.value() == (int)M.index();
6520 }))
6521 return 0;
6522
6523 // Segmented shuffle matching.
// NOTE(review): the end of this condition and the definitions of VTy and
// Segments are on missing listing lines (6527, 6529, 6531).
6524 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6525 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6526 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6528
6530 unsigned Segments =
6532 unsigned SegmentElts = VTy->getNumElements() / Segments;
6533
6534 // dupq zd.t, zn.t[idx]
6535 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6536 ST->isSVEorStreamingSVEAvailable() &&
6537 isDUPQMask(Mask, Segments, SegmentElts))
6538 return LT.first;
6539
6540 // mov zd.q, vn
6541 if (ST->isSVEorStreamingSVEAvailable() &&
6542 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6543 return LT.first;
6544 }
6545
6546 // Check for broadcast loads, which are supported by the LD1R instruction.
6547 // In terms of code-size, the shuffle vector is free when a load + dup get
6548 // folded into a LD1R. That's what we check and return here. For performance
6549 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6550 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6551 // that we model the load + dup sequence slightly higher because LD1R is a
6552 // high latency instruction.
6553 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6554 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6555 if (IsLoad && LT.second.isVector() &&
6556 isLegalBroadcastLoad(SrcTy->getElementType(),
6557 LT.second.getVectorElementCount()))
6558 return 0;
6559 }
6560
6561 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6562 // from the perfect shuffle tables.
// Only applies to 16- and 32-bit elements and to masks whose indices are all
// below 8.
6563 if (Mask.size() == 4 &&
6564 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6565 (SrcTy->getScalarSizeInBits() == 16 ||
6566 SrcTy->getScalarSizeInBits() == 32) &&
6567 all_of(Mask, [](int E) { return E < 8; }))
6568 return getPerfectShuffleCost(Mask);
6569
6570 // Check for other shuffles that are not SK_ kinds but we have native
6571 // instructions for, for example ZIP and UZP.
// `Unused` only receives the out-parameters of the mask matchers; its value is
// never read.
6572 unsigned Unused;
6573 if (LT.second.isFixedLengthVector() &&
6574 LT.second.getVectorNumElements() == Mask.size() &&
6575 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6576 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6577 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6578 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6579 Kind == TTI::SK_InsertSubvector) &&
6580 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6581 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6582 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6583 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6584 LT.second.getVectorNumElements(), 16) ||
6585 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6586 LT.second.getVectorNumElements(), 32) ||
6587 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6588 LT.second.getVectorNumElements(), 64) ||
6589 // Check for non-zero lane splats
6590 all_of(drop_begin(Mask),
6591 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6592 return 1;
6593
// Table-driven costs for the remaining shuffle kinds. The table cost is
// multiplied by LT.first when an entry matches the legalized type.
6594 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6595 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6596 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6597 static const CostTblEntry ShuffleTbl[] = {
6598 // Broadcast shuffle kinds can be performed with 'dup'.
6599 {TTI::SK_Broadcast, MVT::v8i8, 1},
6600 {TTI::SK_Broadcast, MVT::v16i8, 1},
6601 {TTI::SK_Broadcast, MVT::v4i16, 1},
6602 {TTI::SK_Broadcast, MVT::v8i16, 1},
6603 {TTI::SK_Broadcast, MVT::v2i32, 1},
6604 {TTI::SK_Broadcast, MVT::v4i32, 1},
6605 {TTI::SK_Broadcast, MVT::v2i64, 1},
6606 {TTI::SK_Broadcast, MVT::v4f16, 1},
6607 {TTI::SK_Broadcast, MVT::v8f16, 1},
6608 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6609 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6610 {TTI::SK_Broadcast, MVT::v2f32, 1},
6611 {TTI::SK_Broadcast, MVT::v4f32, 1},
6612 {TTI::SK_Broadcast, MVT::v2f64, 1},
6613 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6614 // 'zip1/zip2' instructions.
6615 {TTI::SK_Transpose, MVT::v8i8, 1},
6616 {TTI::SK_Transpose, MVT::v16i8, 1},
6617 {TTI::SK_Transpose, MVT::v4i16, 1},
6618 {TTI::SK_Transpose, MVT::v8i16, 1},
6619 {TTI::SK_Transpose, MVT::v2i32, 1},
6620 {TTI::SK_Transpose, MVT::v4i32, 1},
6621 {TTI::SK_Transpose, MVT::v2i64, 1},
6622 {TTI::SK_Transpose, MVT::v4f16, 1},
6623 {TTI::SK_Transpose, MVT::v8f16, 1},
6624 {TTI::SK_Transpose, MVT::v4bf16, 1},
6625 {TTI::SK_Transpose, MVT::v8bf16, 1},
6626 {TTI::SK_Transpose, MVT::v2f32, 1},
6627 {TTI::SK_Transpose, MVT::v4f32, 1},
6628 {TTI::SK_Transpose, MVT::v2f64, 1},
6629 // Select shuffle kinds.
6630 // TODO: handle vXi8/vXi16.
6631 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6632 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6633 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6634 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6635 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6636 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6637 // PermuteSingleSrc shuffle kinds.
6638 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6639 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6640 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6641 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6642 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6643 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6644 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6645 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6646 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6647 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6648 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6649 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6650 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6651 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6652 // Reverse can be lowered with `rev`.
6653 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6654 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6655 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6656 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6657 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6658 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6659 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6660 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6661 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6662 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6663 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6664 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6665 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6666 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6667 // Splice can all be lowered as `ext`.
6668 {TTI::SK_Splice, MVT::v2i32, 1},
6669 {TTI::SK_Splice, MVT::v4i32, 1},
6670 {TTI::SK_Splice, MVT::v2i64, 1},
6671 {TTI::SK_Splice, MVT::v2f32, 1},
6672 {TTI::SK_Splice, MVT::v4f32, 1},
6673 {TTI::SK_Splice, MVT::v2f64, 1},
6674 {TTI::SK_Splice, MVT::v8f16, 1},
6675 {TTI::SK_Splice, MVT::v8bf16, 1},
6676 {TTI::SK_Splice, MVT::v8i16, 1},
6677 {TTI::SK_Splice, MVT::v16i8, 1},
6678 {TTI::SK_Splice, MVT::v4f16, 1},
6679 {TTI::SK_Splice, MVT::v4bf16, 1},
6680 {TTI::SK_Splice, MVT::v4i16, 1},
6681 {TTI::SK_Splice, MVT::v8i8, 1},
6682 // Broadcast shuffle kinds for scalable vectors
6683 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6684 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6685 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6686 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6687 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6688 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6689 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6690 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6691 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6692 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6693 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6694 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6695 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6696 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6697 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6698 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6699 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6700 // Handle the cases for vector.reverse with scalable vectors
6701 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6702 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6703 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6704 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6705 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6706 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6707 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6708 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6709 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6710 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6711 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6712 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6713 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6714 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6715 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6716 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6717 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6718 };
6719 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6720 return LT.first * Entry->Cost;
6721 }
6722
// The table above holds no scalable SK_Splice entries, so scalable splices are
// costed by the dedicated helper.
6723 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6724 return getSpliceCost(SrcTy, Index, CostKind);
6725
6726 // Inserting a subvector can often be done with either a D, S or H register
6727 // move, so long as the inserted vector is "aligned".
// "Aligned" here means Index is a multiple of the subvector's element count
// and that count divides the element count of the full legalized vector.
6728 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6729 LT.second.getSizeInBits() <= 128 && SubTp) {
6730 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6731 if (SubLT.second.isVector()) {
6732 int NumElts = LT.second.getVectorNumElements();
6733 int NumSubElts = SubLT.second.getVectorNumElements();
6734 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6735 return SubLT.first;
6736 }
6737 }
6738
6739 // Restore optimal kind.
// NOTE(review): the statement guarded by this `if` is on the missing listing
// line 6741; as shown, the `if` appears to guard the return, which is a
// listing artifact.
6740 if (IsExtractSubvector)
6742 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6743 Args, CxtI);
6744}
6745
// Returns true if any load or store in TheLoop accesses memory through a
// pointer for which getPtrStride reports a negative stride; returns false
// otherwise (including when no stride can be determined, which is treated as
// 0).
// NOTE(review): the start of the signature (listing lines 6746-6747) and the
// declaration of `Ptr` (line 6755) are missing from this listing; the name
// containsDecreasingPointers is inferred, so confirm it against the source.
6748 const DominatorTree &DT) {
// An empty stride map is passed to getPtrStride.
6749 const auto &Strides = DenseMap<Value *, const SCEV *>();
6750 for (BasicBlock *BB : TheLoop->blocks()) {
6751 // Scan the instructions in the block and look for addresses that are
6752 // consecutive and decreasing.
6753 for (Instruction &I : *BB) {
6754 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6756 Type *AccessTy = getLoadStoreType(&I);
6757 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6758 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6759 .value_or(0) < 0)
6760 return true;
6761 }
6762 }
6763 }
6764 return false;
6765}
6766
// Decides whether fixed-width vectorization should be preferred over scalable
// vectorization when both have equal cost (going by the option and subtarget
// hook names used below).
// NOTE(review): the signature (listing line 6767) and the statement guarded by
// the first `if` (line 6769) are missing from this listing. The guarded
// statement presumably returns the command-line option's value - confirm.
6768 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6770 // For cases like post-LTO vectorization, when we eventually know the trip
6771 // count, epilogue with fixed-width vectorization can be deleted if the trip
6772 // count is less than the epilogue iterations. That's why we prefer
6773 // fixed-width vectorization in epilogue in case of equal costs.
6774 if (IsEpilogue)
6775 return true;
// Otherwise defer to the subtarget's preference.
6776 return ST->useFixedOverScalableIfEqualCost();
6777}
6778
// Forwards the subtarget's minimum VF for epilogue vectorization.
// NOTE(review): the signature (listing line 6779) is missing from this
// listing.
6780 return ST->getEpilogueVectorizationMinVF();
6781}
6782
// Decides whether the loop described by TFI should be tail-folded.
// Returns false when the subtarget lacks SVE, when the loop has interleave
// groups, when the tail-folding options required by the loop are not
// satisfied, or when the loop contains fewer than SVETailFoldInsnThreshold
// instructions; returns true otherwise.
// NOTE(review): the signature (listing line 6783), the declaration of
// `Required` (line 6793) and the start of the negative-stride check (lines
// 6802-6803) are missing from this listing.
6784 if (!ST->hasSVE())
6785 return false;
6786
6787 // We don't currently support vectorisation with interleaving for SVE - with
6788 // such loops we're better off not using tail-folding. This gives us a chance
6789 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6790 if (TFI->IAI->hasGroups())
6791 return false;
6792
// Collect the tail-folding capabilities this loop would need.
6794 if (TFI->LVL->getReductionVars().size())
6795 Required |= TailFoldingOpts::Reductions;
6796 if (TFI->LVL->getFixedOrderRecurrences().size())
6797 Required |= TailFoldingOpts::Recurrences;
6798
6799 // We call this to discover whether any load/store pointers in the loop have
6800 // negative strides. This will require extra work to reverse the loop
6801 // predicate, which may be expensive.
6804 *TFI->LVL->getDominatorTree()))
6805 Required |= TailFoldingOpts::Reverse;
// A loop that needs none of the above still requires the Simple option.
6806 if (Required == TailFoldingOpts::Disabled)
6807 Required |= TailFoldingOpts::Simple;
6808
6809 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6810 Required))
6811 return false;
6812
6813 // Don't tail-fold for tight loops where we would be better off interleaving
6814 // with an unpredicated loop.
6815 unsigned NumInsns = 0;
6816 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6817 NumInsns += BB->size();
6818 }
6819
6820 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6821 return NumInsns >= SVETailFoldInsnThreshold;
6822}
6823
// Cost of the scaling factor in an addressing mode. If the described
// addressing mode is legal for Ty in AddrSpace, the result is 1 when Scale is
// neither 0 nor 1, and 0 otherwise.
// NOTE(review): the start of the signature (listing lines 6824-6825), the
// declaration of `AM` (line 6835) and the final return taken for illegal
// addressing modes (line 6845) are missing from this listing; the name
// getScalingFactorCost is inferred, so confirm it against the source.
6826 StackOffset BaseOffset, bool HasBaseReg,
6827 int64_t Scale, unsigned AddrSpace) const {
6828 // Scaling factors are not free at all.
6829 // Operands | Rt Latency
6830 // -------------------------------------------
6831 // Rt, [Xn, Xm] | 4
6832 // -------------------------------------------
6833 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6834 // Rt, [Xn, Wm, <extend> #imm] |
// Describe the requested addressing mode; the fixed and scalable parts of
// BaseOffset are stored separately.
6836 AM.BaseGV = BaseGV;
6837 AM.BaseOffs = BaseOffset.getFixed();
6838 AM.HasBaseReg = HasBaseReg;
6839 AM.Scale = Scale;
6840 AM.ScalableOffset = BaseOffset.getScalable();
6841 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6842 // Scale represents reg2 * scale, thus account for 1 if
6843 // it is not equal to 0 or 1.
6844 return AM.Scale != 0 && AM.Scale != 1;
6846}
6847
6849 const Instruction *I) const {
// NOTE(review): the start of this signature (listing line 6848) and the
// condition opening the braced region closed on listing line 6862 (listing
// line 6850) are absent from this listing; presumably
// AArch64TTIImpl::shouldTreatInstructionLikeSelect - confirm against the real
// source.
//
// Visible behaviour inside that region: returns true for an `or` that is
// immediately followed by an unconditional branch, and for any add or sub.
6851 // For the binary operators (e.g. or) we need to be more careful than
6852 // selects, here we only transform them if they are already at a natural
6853 // break point in the code - the end of a block with an unconditional
6854 // terminator.
6855 if (I->getOpcode() == Instruction::Or &&
6856 isa<UncondBrInst>(I->getNextNode()))
6857 return true;
6858
6859 if (I->getOpcode() == Instruction::Add ||
6860 I->getOpcode() == Instruction::Sub)
6861 return true;
6862 }
// NOTE(review): the final return (listing line 6863) is absent from this
// listing.
6864}
6865
6868 const TargetTransformInfo::LSRCost &C2) const {
// NOTE(review): the start of this signature (listing lines 6866-6867) is
// absent from this listing; presumably AArch64TTIImpl::isLSRCostLess - confirm
// against the real source.
//
// Visible behaviour: when EnableLSRCostOpt is set, C1 is "less" than C2 if its
// cost fields compare lexicographically smaller in the order NumRegs, Insns,
// NumBaseAdds, AddRecCost, NumIVMuls, ScaleCost, ImmCost, SetupCost.
6869 // AArch64 specific here is adding the number of instructions to the
6870 // comparison (though not as the first consideration, as some targets do)
6871 // along with changing the priority of the base additions.
6872 // TODO: Maybe a more nuanced tradeoff between instruction count
6873 // and number of registers? To be investigated at a later date.
6874 if (EnableLSRCostOpt)
6875 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6876 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6877 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6878 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6879
// NOTE(review): the return used when EnableLSRCostOpt is off (listing line
// 6880) is absent from this listing.
6881}
6882
6883static bool isSplatShuffle(Value *V) {
6884 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6885 return all_equal(Shuf->getShuffleMask());
6886 return false;
6887}
6888
6889/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6890/// or upper half of the vector elements.
6891static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6892 bool AllowSplat = false) {
6893 // Scalable types can't be extract shuffle vectors.
6894 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6895 return false;
6896
6897 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6898 auto *FullTy = FullV->getType();
6899 auto *HalfTy = HalfV->getType();
6900 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6901 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6902 };
6903
6904 auto extractHalf = [](Value *FullV, Value *HalfV) {
6905 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6906 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6907 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6908 };
6909
6910 ArrayRef<int> M1, M2;
6911 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6912 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6913 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6914 return false;
6915
6916 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6917 // it is not checked as an extract below.
6918 if (AllowSplat && isSplatShuffle(Op1))
6919 S1Op1 = nullptr;
6920 if (AllowSplat && isSplatShuffle(Op2))
6921 S2Op1 = nullptr;
6922
6923 // Check that the operands are half as wide as the result and we extract
6924 // half of the elements of the input vectors.
6925 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6926 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6927 return false;
6928
6929 // Check the mask extracts either the lower or upper half of vector
6930 // elements.
6931 int M1Start = 0;
6932 int M2Start = 0;
6933 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6934 if ((S1Op1 &&
6935 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6936 (S2Op1 &&
6937 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6938 return false;
6939
6940 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6941 (M2Start != 0 && M2Start != (NumElements / 2)))
6942 return false;
6943 if (S1Op1 && S2Op1 && M1Start != M2Start)
6944 return false;
6945
6946 return true;
6947}
6948
6949/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6950/// of the vector elements.
6951static bool areExtractExts(Value *Ext1, Value *Ext2) {
6952 auto areExtDoubled = [](Instruction *Ext) {
6953 return Ext->getType()->getScalarSizeInBits() ==
6954 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6955 };
6956
6957 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6958 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6959 !areExtDoubled(cast<Instruction>(Ext1)) ||
6960 !areExtDoubled(cast<Instruction>(Ext2)))
6961 return false;
6962
6963 return true;
6964}
6965
6966/// Check if Op could be used with vmull_high_p64 intrinsic.
6968 Value *VectorOperand = nullptr;
6969 ConstantInt *ElementIndex = nullptr;
6970 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6971 m_ConstantInt(ElementIndex))) &&
6972 ElementIndex->getValue() == 1 &&
6973 isa<FixedVectorType>(VectorOperand->getType()) &&
6974 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6975}
6976
6977/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6978static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6980}
6981
6983 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6984 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6985 if (!GEP || GEP->getNumOperands() != 2)
6986 return false;
6987
6988 Value *Base = GEP->getOperand(0);
6989 Value *Offsets = GEP->getOperand(1);
6990
6991 // We only care about scalar_base+vector_offsets.
6992 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6993 return false;
6994
6995 // Sink extends that would allow us to use 32-bit offset vectors.
6996 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6997 auto *OffsetsInst = cast<Instruction>(Offsets);
6998 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6999 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
7000 Ops.push_back(&GEP->getOperandUse(1));
7001 }
7002
7003 // Sink the GEP.
7004 return true;
7005}
7006
7007/// We want to sink following cases:
7008/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
7009/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
7011 if (match(Op, m_VScale()))
7012 return true;
7013 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
7015 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7016 return true;
7017 }
7018 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
7020 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
7021 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
7022 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7023 return true;
7024 }
7025 return false;
7026}
7027
7028static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
7029
7030/// Check if sinking \p I's operands to I's basic block is profitable, because
7031/// the operands can be folded into a target instruction, e.g.
7032/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
// NOTE(review): the function signature and the opening of the intrinsic-only
// region that is closed on listing line 7151 (listing lines 7033-7035) are
// absent from this listing. Presumably this is
// AArch64TTIImpl::isProfitableToSinkOperands - confirm against the real source.
//
// Throughout, the uses that should be sunk are appended to Ops, and the return
// value says whether sinking them is worthwhile.
//
// First part: I is an intrinsic call, referred to as II.
7036 switch (II->getIntrinsicID()) {
7037 case Intrinsic::aarch64_neon_smull:
7038 case Intrinsic::aarch64_neon_umull:
// Both operands are half-extracts (or splats): sink both.
7039 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
7040 /*AllowSplat=*/true)) {
7041 Ops.push_back(&II->getOperandUse(0));
7042 Ops.push_back(&II->getOperandUse(1));
7043 return true;
7044 }
7045 [[fallthrough]];
7046
7047 case Intrinsic::fma:
7048 case Intrinsic::fmuladd:
// Half-precision vectors are rejected outright without full FP16 support.
7049 if (isa<VectorType>(I->getType()) &&
7050 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7051 !ST->hasFullFP16())
7052 return false;
7053
7054 if (isFNeg(II->getOperand(0)))
7055 Ops.push_back(&II->getOperandUse(0));
7056 if (isFNeg(II->getOperand(1)))
7057 Ops.push_back(&II->getOperandUse(1));
7058
7059 [[fallthrough]];
7060 case Intrinsic::aarch64_neon_sqdmull:
7061 case Intrinsic::aarch64_neon_sqdmulh:
7062 case Intrinsic::aarch64_neon_sqrdmulh:
7063 // Sink splats for index lane variants
7064 if (isSplatShuffle(II->getOperand(0)))
7065 Ops.push_back(&II->getOperandUse(0));
7066 if (isSplatShuffle(II->getOperand(1)))
7067 Ops.push_back(&II->getOperandUse(1));
7068 return !Ops.empty();
7069 case Intrinsic::aarch64_neon_fmlal:
7070 case Intrinsic::aarch64_neon_fmlal2:
7071 case Intrinsic::aarch64_neon_fmlsl:
7072 case Intrinsic::aarch64_neon_fmlsl2:
7073 // Sink splats for index lane variants
7074 if (isSplatShuffle(II->getOperand(1)))
7075 Ops.push_back(&II->getOperandUse(1));
7076 if (isSplatShuffle(II->getOperand(2)))
7077 Ops.push_back(&II->getOperandUse(2));
7078 return !Ops.empty();
7079 case Intrinsic::aarch64_sve_ptest_first:
7080 case Intrinsic::aarch64_sve_ptest_last:
// Sink operand 0 when it is produced by an aarch64_sve_ptrue intrinsic.
7081 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
7082 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
7083 Ops.push_back(&II->getOperandUse(0));
7084 return !Ops.empty();
7085 case Intrinsic::aarch64_sme_write_horiz:
7086 case Intrinsic::aarch64_sme_write_vert:
7087 case Intrinsic::aarch64_sme_writeq_horiz:
7088 case Intrinsic::aarch64_sme_writeq_vert: {
// Sink operand 1 only when it is computed by an add instruction.
7089 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
7090 if (!Idx || Idx->getOpcode() != Instruction::Add)
7091 return false;
7092 Ops.push_back(&II->getOperandUse(1));
7093 return true;
7094 }
7095 case Intrinsic::aarch64_sme_read_horiz:
7096 case Intrinsic::aarch64_sme_read_vert:
7097 case Intrinsic::aarch64_sme_readq_horiz:
7098 case Intrinsic::aarch64_sme_readq_vert:
7099 case Intrinsic::aarch64_sme_ld1b_vert:
7100 case Intrinsic::aarch64_sme_ld1h_vert:
7101 case Intrinsic::aarch64_sme_ld1w_vert:
7102 case Intrinsic::aarch64_sme_ld1d_vert:
7103 case Intrinsic::aarch64_sme_ld1q_vert:
7104 case Intrinsic::aarch64_sme_st1b_vert:
7105 case Intrinsic::aarch64_sme_st1h_vert:
7106 case Intrinsic::aarch64_sme_st1w_vert:
7107 case Intrinsic::aarch64_sme_st1d_vert:
7108 case Intrinsic::aarch64_sme_st1q_vert:
7109 case Intrinsic::aarch64_sme_ld1b_horiz:
7110 case Intrinsic::aarch64_sme_ld1h_horiz:
7111 case Intrinsic::aarch64_sme_ld1w_horiz:
7112 case Intrinsic::aarch64_sme_ld1d_horiz:
7113 case Intrinsic::aarch64_sme_ld1q_horiz:
7114 case Intrinsic::aarch64_sme_st1b_horiz:
7115 case Intrinsic::aarch64_sme_st1h_horiz:
7116 case Intrinsic::aarch64_sme_st1w_horiz:
7117 case Intrinsic::aarch64_sme_st1d_horiz:
7118 case Intrinsic::aarch64_sme_st1q_horiz: {
// Same rule as above, but here the candidate is operand 3.
7119 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
7120 if (!Idx || Idx->getOpcode() != Instruction::Add)
7121 return false;
7122 Ops.push_back(&II->getOperandUse(3));
7123 return true;
7124 }
7125 case Intrinsic::aarch64_neon_pmull:
7126 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
7127 return false;
7128 Ops.push_back(&II->getOperandUse(0));
7129 Ops.push_back(&II->getOperandUse(1));
7130 return true;
7131 case Intrinsic::aarch64_neon_pmull64:
7132 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
7133 II->getArgOperand(1)))
7134 return false;
7135 Ops.push_back(&II->getArgOperandUse(0));
7136 Ops.push_back(&II->getArgOperandUse(1));
7137 return true;
7138 case Intrinsic::masked_gather:
// The pointer vector is argument 0 for a gather and argument 1 for a scatter.
7139 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
7140 return false;
7141 Ops.push_back(&II->getArgOperandUse(0));
7142 return true;
7143 case Intrinsic::masked_scatter:
7144 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
7145 return false;
7146 Ops.push_back(&II->getArgOperandUse(1));
7147 return true;
7148 default:
7149 return false;
7150 }
7151 }
7152
// Helper for select/branch conditions: accepts a vector_reduce_or over a
// scalable vector and additionally sinks its operand when that is a compare.
// NOTE(review): listing lines 7155 and 7157 (the checks that establish II
// inside this lambda) are absent from this listing.
7153 auto ShouldSinkCondition = [](Value *Cond,
7154 SmallVectorImpl<Use *> &Ops) -> bool {
7156 return false;
7158 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
7159 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
7160 return false;
7161 if (isa<CmpInst>(II->getOperand(0)))
7162 Ops.push_back(&II->getOperandUse(0));
7163 return true;
7164 };
7165
// Second part: cases handled for any result type. Cases that `break` continue
// to the vector-only switch further down.
7166 switch (I->getOpcode()) {
7167 case Instruction::GetElementPtr:
7168 case Instruction::Add:
7169 case Instruction::Sub:
7170 // Sink vscales closer to uses for better isel
7171 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
7172 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
7173 Ops.push_back(&I->getOperandUse(Op));
7174 return true;
7175 }
7176 }
7177 break;
7178 case Instruction::Select: {
7179 if (!ShouldSinkCondition(I->getOperand(0), Ops))
7180 return false;
7181
7182 Ops.push_back(&I->getOperandUse(0));
7183 return true;
7184 }
7185 case Instruction::UncondBr:
7186 return false;
7187 case Instruction::CondBr: {
7188 if (!ShouldSinkCondition(cast<CondBrInst>(I)->getCondition(), Ops))
7189 return false;
7190
7191 Ops.push_back(&I->getOperandUse(0));
7192 return true;
7193 }
7194 case Instruction::FMul:
7195 // fmul with contract flag can be combined with fadd into fma.
7196 // Sinking fneg into this block enables fmls pattern.
7197 if (cast<FPMathOperator>(I)->hasAllowContract()) {
7198 if (isFNeg(I->getOperand(0)))
7199 Ops.push_back(&I->getOperandUse(0));
7200 if (isFNeg(I->getOperand(1)))
7201 Ops.push_back(&I->getOperandUse(1));
7202 }
7203 break;
7204
7205 // Type | BIC | ORN | EON
7206 // ----------------+-----------+-----------+-----------
7207 // scalar | Base | Base | Base
7208 // scalar w/shift | - | - | -
7209 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
7210 // scalable vector | SVE | - | BSL2N
7211 case Instruction::Xor:
7212 // EON only for scalars (possibly expanded fixed vectors)
7213 // and vectors using the SVE2/SME BSL2N instruction.
7214 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
7215 bool HasBSL2N =
7216 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
7217 if (!HasBSL2N)
7218 break;
7219 }
7220 [[fallthrough]];
7221 case Instruction::And:
7222 case Instruction::Or:
7223 // Even though we could use the SVE2/SME BSL2N instruction,
7224 // it might pessimize with an extra MOV depending on register allocation.
7225 if (I->getOpcode() == Instruction::Or &&
7226 isa<ScalableVectorType>(I->getType()))
7227 break;
7228 // Shifts can be folded into scalar AND/ORR/EOR,
7229 // but not the non-negated operand of BIC/ORN/EON.
// NOTE(review): the second half of this condition (listing line 7231) is
// absent from this listing.
7230 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
7232 break;
7233 for (auto &Op : I->operands()) {
7234 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
7235 if (match(Op.get(), m_Not(m_Value()))) {
7236 Ops.push_back(&Op);
7237 return true;
7238 }
7239 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
// NOTE(review): the first part of this pattern (listing line 7241) is absent
// from this listing.
7240 if (match(Op.get(),
7242 m_Value(), m_ZeroMask()))) {
// Sink the whole chain, innermost first: the not, the insertelement it feeds,
// then the splat shuffle.
7243 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7244 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7245 Ops.push_back(&Not);
7246 Ops.push_back(&InsertElt);
7247 Ops.push_back(&Op);
7248 return true;
7249 }
7250 }
7251 break;
7252 default:
7253 break;
7254 }
7255
// Everything below applies to vector-typed instructions only; for other types
// the answer is whether anything was collected above.
7256 if (!I->getType()->isVectorTy())
7257 return !Ops.empty();
7258
7259 switch (I->getOpcode()) {
7260 case Instruction::Sub:
7261 case Instruction::Add: {
7262 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7263 return false;
7264
7265 // If the exts' operands extract either the lower or upper elements, we
7266 // can sink them too.
7267 auto Ext1 = cast<Instruction>(I->getOperand(0));
7268 auto Ext2 = cast<Instruction>(I->getOperand(1));
7269 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7270 Ops.push_back(&Ext1->getOperandUse(0));
7271 Ops.push_back(&Ext2->getOperandUse(0));
7272 }
7273
7274 Ops.push_back(&I->getOperandUse(0));
7275 Ops.push_back(&I->getOperandUse(1));
7276
7277 return true;
7278 }
7279 case Instruction::Or: {
7280 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7281 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7282 if (ST->hasNEON()) {
7283 Instruction *OtherAnd, *IA, *IB;
7284 Value *MaskValue;
7285 // MainAnd refers to And instruction that has 'Not' as one of its operands
7286 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7287 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7288 m_Instruction(IA)))))) {
7289 if (match(OtherAnd,
7290 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
// The commutative match does not say which operand of the Or is MainAnd, so
// it is identified as the one that is not OtherAnd.
7291 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7292 ? cast<Instruction>(I->getOperand(1))
7293 : cast<Instruction>(I->getOperand(0));
7294
7295 // Both Ands should be in same basic block as Or
7296 if (I->getParent() != MainAnd->getParent() ||
7297 I->getParent() != OtherAnd->getParent())
7298 return false;
7299
7300 // Non-mask operands of both Ands should also be in same basic block
7301 if (I->getParent() != IA->getParent() ||
7302 I->getParent() != IB->getParent())
7303 return false;
7304
// Sink the operand of MainAnd that is not IA (i.e. the Not), then both
// operands of the Or.
7305 Ops.push_back(
7306 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7307 Ops.push_back(&I->getOperandUse(0));
7308 Ops.push_back(&I->getOperandUse(1));
7309
7310 return true;
7311 }
7312 }
7313 }
7314
7315 return false;
7316 }
7317 case Instruction::Mul: {
7318 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7319 auto *Ty = cast<VectorType>(V->getType());
7320 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7321 if (Ty->isScalableTy())
7322 return false;
7323
7324 // Indexed variants of Mul exist for i16 and i32 element types only.
7325 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7326 };
7327
// Count how many operands behave as sign- and zero-extends; two of the same
// kind make sinking profitable (see the check after the loop).
7328 int NumZExts = 0, NumSExts = 0;
7329 for (auto &Op : I->operands()) {
7330 // Make sure we are not already sinking this operand
7331 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7332 continue;
7333
7334 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7335 auto *Ext = cast<Instruction>(Op);
7336 auto *ExtOp = Ext->getOperand(0);
7337 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7338 Ops.push_back(&Ext->getOperandUse(0));
7339 Ops.push_back(&Op);
7340
7341 if (isa<SExtInst>(Ext)) {
7342 NumSExts++;
7343 } else {
7344 NumZExts++;
7345 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7346 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7347 I->getType()->getScalarSizeInBits())
7348 NumSExts++;
7349 }
7350
7351 continue;
7352 }
7353
// NOTE(review): the declaration of `Shuffle` (listing line 7354) is absent
// from this listing.
7355 if (!Shuffle)
7356 continue;
7357
7358 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7359 // operand and the s/zext can help create indexed s/umull. This is
7360 // especially useful to prevent i64 mul being scalarized.
7361 if (isSplatShuffle(Shuffle) &&
7362 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7363 Ops.push_back(&Shuffle->getOperandUse(0));
7364 Ops.push_back(&Op);
7365 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7366 NumSExts++;
7367 else
7368 NumZExts++;
7369 continue;
7370 }
7371
7372 Value *ShuffleOperand = Shuffle->getOperand(0);
7373 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7374 if (!Insert)
7375 continue;
7376
7377 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7378 if (!OperandInstr)
7379 continue;
7380
7381 ConstantInt *ElementConstant =
7382 dyn_cast<ConstantInt>(Insert->getOperand(2));
7383 // Check that the insertelement is inserting into element 0
7384 if (!ElementConstant || !ElementConstant->isZero())
7385 continue;
7386
7387 unsigned Opcode = OperandInstr->getOpcode();
7388 if (Opcode == Instruction::SExt)
7389 NumSExts++;
7390 else if (Opcode == Instruction::ZExt)
7391 NumZExts++;
7392 else {
7393 // If we find that the top bits are known 0, then we can sink and allow
7394 // the backend to generate a umull.
7395 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7396 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7397 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7398 continue;
7399 NumZExts++;
7400 }
7401
7402 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7403 // the And, just to hoist it again back to the load.
7404 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7405 Ops.push_back(&Insert->getOperandUse(1));
7406 Ops.push_back(&Shuffle->getOperandUse(0));
7407 Ops.push_back(&Op);
7408 }
7409
7410 // It is profitable to sink if we found two of the same type of extends.
7411 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7412 return true;
7413
7414 // Otherwise, see if we should sink splats for indexed variants.
7415 if (!ShouldSinkSplatForIndexedVariant(I))
7416 return false;
7417
// Discard whatever the extend analysis collected and start over with splats.
7418 Ops.clear();
7419 if (isSplatShuffle(I->getOperand(0)))
7420 Ops.push_back(&I->getOperandUse(0));
7421 if (isSplatShuffle(I->getOperand(1)))
7422 Ops.push_back(&I->getOperandUse(1));
7423
7424 return !Ops.empty();
7425 }
7426 case Instruction::FMul: {
// Ops may already hold fneg operands from the first switch, hence the
// `!Ops.empty()` results on the early exits below.
7427 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7428 if (I->getType()->isScalableTy())
7429 return !Ops.empty();
7430
7431 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7432 !ST->hasFullFP16())
7433 return !Ops.empty();
7434
7435 // Sink splats for index lane variants
7436 if (isSplatShuffle(I->getOperand(0)))
7437 Ops.push_back(&I->getOperandUse(0));
7438 if (isSplatShuffle(I->getOperand(1)))
7439 Ops.push_back(&I->getOperandUse(1));
7440 return !Ops.empty();
7441 }
7442 default:
7443 return false;
7444 }
7445 return false;
7446}
static bool isAllActivePredicate(const SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPairwiseAddLong(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineSVEVectorMlaU(InstCombiner &IC, IntrinsicInst &II)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
static constexpr Value * getValue(Ty &ValueOrUse)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
InstructionCost getBranchMispredictPenalty() const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
unsigned getMaxInterleaveFactor(ElementCount VF, bool HasUnorderedReductions) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:254
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:839
bool isUnsigned() const
Definition InstrTypes.h:999
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
bool empty() const
Definition DenseMap.h:171
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:214
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
bool approxFunc() const
Definition FMF.h:70
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2617
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2605
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:547
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:567
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:534
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:552
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2000
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:482
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2314
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2529
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1737
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1439
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2232
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1906
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2639
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1919
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1422
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:562
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2305
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
Value * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1126
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
bool isFixedLengthVector() const
MVT getVectorElementType() const
size_type size() const
Definition MapVector.h:58
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:889
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:736
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:993
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool ShouldCheckWrap=true, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:382
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool isFixedLengthVector() const
Definition ValueTypes.h:199
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:187
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
bool isVariant() const
Definition MCSchedule.h:150
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:264
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...