LLVM 23.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Hidden boolean command-line option "enable-falkor-hwpf-unroll-fix".
// Initialised to true, so the behaviour it guards is on unless the flag is
// explicitly overridden.
// NOTE(review): the code that reads this flag is not visible in this chunk.
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
// Hidden unsigned command-line option "sve-scatter-overhead", default 10.
// NOTE(review): presumably an extra cost charged for SVE scatter stores;
// confirm against the use site, which is not visible in this chunk.
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
// Hidden unsigned command-line option "sve-tail-folding-insn-threshold",
// default 15.
// NOTE(review): presumably an instruction-count threshold consulted when
// deciding whether to tail-fold; confirm against the use site.
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
// Hidden boolean command-line option "enable-aarch64-or-like-select".
// Initialised to true.
// NOTE(review): the code that reads this flag is not visible in this chunk.
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
// Hidden boolean command-line option "enable-aarch64-lsr-cost-opt".
// Initialised to true.
// NOTE(review): the code that reads this flag is not visible in this chunk.
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// File-level TailFoldingOption instance.
// NOTE(review): presumably the external storage for the "sve-tail-folding"
// command-line option declared just below; confirm against that
// declaration, whose header line is not visible in this chunk.
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
// Subtarget features that describe a restriction rather than a capability.
// areInlineCompatible XORs this mask into both the caller's and the callee's
// feature bits before testing that the callee's set is contained in the
// caller's, which reverses the direction of the check for these features.
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
658 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
659 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
660 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
661 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
662 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
663 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
664 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
665 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
666 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(RetTy);
669 const auto *Entry =
670 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(ValidSatTys, equal_to(LT.second)))
688 return LT.first * Instrs;
689
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
702 MVT::nxv4i32, MVT::nxv2i64};
703 auto LT = getTypeLegalizationCost(RetTy);
704 if (any_of(ValidAbsTys, equal_to(LT.second)))
705 return LT.first;
706 break;
707 }
708 case Intrinsic::bswap: {
709 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
710 MVT::v4i32, MVT::v2i64};
711 auto LT = getTypeLegalizationCost(RetTy);
712 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
713 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
714 return LT.first;
715 break;
716 }
717 case Intrinsic::fma:
718 case Intrinsic::fmuladd: {
719 // Given a fma or fmuladd, cost it the same as an fmul instruction, since the
720 // two usually have the same cost. TODO: Add fp16 and bf16 expansion costs.
721 Type *EltTy = RetTy->getScalarType();
722 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
723 (EltTy->isHalfTy() && ST->hasFullFP16()))
724 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
725 break;
726 }
727 case Intrinsic::stepvector: {
728 InstructionCost Cost = 1; // Cost of the `index' instruction
729 auto LT = getTypeLegalizationCost(RetTy);
730 // Legalisation of illegal vectors involves an `index' instruction plus
731 // (LT.first - 1) vector adds.
732 if (LT.first > 1) {
733 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
734 InstructionCost AddCost =
735 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
736 Cost += AddCost * (LT.first - 1);
737 }
738 return Cost;
739 }
740 case Intrinsic::vector_extract:
741 case Intrinsic::vector_insert: {
742 // If both the vector and subvector types are legal types and the index
743 // is 0, then this should be a no-op or simple operation; return a
744 // relatively low cost.
745
746 // If arguments aren't actually supplied, then we cannot determine the
747 // value of the index. We also want to skip predicate types.
748 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
750 break;
751
752 LLVMContext &C = RetTy->getContext();
753 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
754 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
755 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
756 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
757 // Skip this if either the vector or subvector types are unpacked
758 // SVE types; they may get lowered to stack stores and loads.
759 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
760 break;
761
763 getTLI()->getTypeConversion(C, SubVecVT);
765 getTLI()->getTypeConversion(C, VecVT);
766 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
767 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
768 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
769 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
770 return TTI::TCC_Free;
771 break;
772 }
773 case Intrinsic::bitreverse: {
774 static const CostTblEntry BitreverseTbl[] = {
775 {Intrinsic::bitreverse, MVT::i32, 1},
776 {Intrinsic::bitreverse, MVT::i64, 1},
777 {Intrinsic::bitreverse, MVT::v8i8, 1},
778 {Intrinsic::bitreverse, MVT::v16i8, 1},
779 {Intrinsic::bitreverse, MVT::v4i16, 2},
780 {Intrinsic::bitreverse, MVT::v8i16, 2},
781 {Intrinsic::bitreverse, MVT::v2i32, 2},
782 {Intrinsic::bitreverse, MVT::v4i32, 2},
783 {Intrinsic::bitreverse, MVT::v1i64, 2},
784 {Intrinsic::bitreverse, MVT::v2i64, 2},
785 };
786 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
787 const auto *Entry =
788 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
789 if (Entry) {
790 // The cost model uses the legal type (i32) that i8 and i16 are converted
791 // to; add 1 so that we match the actual lowering cost.
792 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
793 TLI->getValueType(DL, RetTy, true) == MVT::i16)
794 return LegalisationCost.first * Entry->Cost + 1;
795
796 return LegalisationCost.first * Entry->Cost;
797 }
798 break;
799 }
800 case Intrinsic::ctpop: {
801 if (!ST->hasNEON()) {
802 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
803 return getTypeLegalizationCost(RetTy).first * 12;
804 }
805 static const CostTblEntry CtpopCostTbl[] = {
806 {ISD::CTPOP, MVT::v2i64, 4},
807 {ISD::CTPOP, MVT::v4i32, 3},
808 {ISD::CTPOP, MVT::v8i16, 2},
809 {ISD::CTPOP, MVT::v16i8, 1},
810 {ISD::CTPOP, MVT::i64, 4},
811 {ISD::CTPOP, MVT::v2i32, 3},
812 {ISD::CTPOP, MVT::v4i16, 2},
813 {ISD::CTPOP, MVT::v8i8, 1},
814 {ISD::CTPOP, MVT::i32, 5},
815 // SVE types (For targets that override NEON for fixed length vectors)
816 {ISD::CTPOP, MVT::nxv2i64, 1},
817 {ISD::CTPOP, MVT::nxv4i32, 1},
818 {ISD::CTPOP, MVT::nxv8i16, 1},
819 {ISD::CTPOP, MVT::nxv16i8, 1},
820 };
821 auto LT = getTypeLegalizationCost(RetTy);
822 MVT MTy = LT.second;
823
824 // When SVE is available CNT will be used for fixed and scalable vectors.
825 if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector())
827 128 / MTy.getScalarSizeInBits());
828
829 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
830 // Extra cost of +1 when illegal vector types are legalized by promoting
831 // the integer type.
832 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
833 RetTy->getScalarSizeInBits()
834 ? 1
835 : 0;
836 return LT.first * Entry->Cost + ExtraCost;
837 }
838 break;
839 }
840 case Intrinsic::sadd_with_overflow:
841 case Intrinsic::uadd_with_overflow:
842 case Intrinsic::ssub_with_overflow:
843 case Intrinsic::usub_with_overflow:
844 case Intrinsic::smul_with_overflow:
845 case Intrinsic::umul_with_overflow: {
846 static const CostTblEntry WithOverflowCostTbl[] = {
847 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
848 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
849 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
850 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
851 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
852 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
853 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
854 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
855 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
856 {Intrinsic::usub_with_overflow, MVT::i8, 3},
857 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
858 {Intrinsic::usub_with_overflow, MVT::i16, 3},
859 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
860 {Intrinsic::usub_with_overflow, MVT::i32, 1},
861 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
862 {Intrinsic::usub_with_overflow, MVT::i64, 1},
863 {Intrinsic::smul_with_overflow, MVT::i8, 5},
864 {Intrinsic::umul_with_overflow, MVT::i8, 4},
865 {Intrinsic::smul_with_overflow, MVT::i16, 5},
866 {Intrinsic::umul_with_overflow, MVT::i16, 4},
867 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
868 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
869 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
870 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
871 };
872 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
873 if (MTy.isSimple())
874 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
875 MTy.getSimpleVT()))
876 return Entry->Cost;
877 break;
878 }
879 case Intrinsic::fptosi_sat:
880 case Intrinsic::fptoui_sat: {
881 if (ICA.getArgTypes().empty())
882 break;
883 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
884 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
885 EVT MTy = TLI->getValueType(DL, RetTy);
886 // Check for the legal types, which are where the size of the input and the
887 // output are the same, or we are using cvt f64->i32 or f32->i64.
888 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
889 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
890 LT.second == MVT::v2f64)) {
891 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
892 (LT.second == MVT::f64 && MTy == MVT::i32) ||
893 (LT.second == MVT::f32 && MTy == MVT::i64)))
894 return LT.first;
895 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
896 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
897 MTy.getScalarSizeInBits() == 64)
898 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
899 }
900 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
901 // f32.
902 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
903 return LT.first + getIntrinsicInstrCost(
904 {ICA.getID(),
905 RetTy,
906 {ICA.getArgTypes()[0]->getWithNewType(
907 Type::getFloatTy(RetTy->getContext()))}},
908 CostKind);
909 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
910 (LT.second == MVT::f16 && MTy == MVT::i64) ||
911 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
912 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
913 return LT.first;
914 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
915 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
916 MTy.getScalarSizeInBits() == 32)
917 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
918 // Extending vector types v8f16->v8i64. These currently scalarize but the
919 // codegen could be better.
920 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
921 MTy.getScalarSizeInBits() == 64)
922 return MTy.getVectorNumElements() * 3;
923
924 // If we can we use a legal convert followed by a min+max
925 if ((LT.second.getScalarType() == MVT::f32 ||
926 LT.second.getScalarType() == MVT::f64 ||
927 LT.second.getScalarType() == MVT::f16) &&
928 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
929 Type *LegalTy =
930 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
931 if (LT.second.isVector())
932 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
934 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
935 : Intrinsic::umin,
936 LegalTy, {LegalTy, LegalTy});
938 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
939 : Intrinsic::umax,
940 LegalTy, {LegalTy, LegalTy});
942 return LT.first * Cost +
943 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
944 : 1);
945 }
946 // Otherwise we need to follow the default expansion that clamps the value
947 // using a float min/max with a fcmp+sel for nan handling when signed.
948 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
949 RetTy = RetTy->getScalarType();
950 if (LT.second.isVector()) {
951 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
952 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
953 }
954 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
956 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
958 Cost +=
959 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
961 if (IsSigned) {
962 Type *CondTy = RetTy->getWithNewBitWidth(1);
963 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
965 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
967 }
968 return LT.first * Cost;
969 }
970 case Intrinsic::fshl:
971 case Intrinsic::fshr: {
972 if (ICA.getArgs().empty())
973 break;
974
975 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
976
977 // ROTR / ROTL is a funnel shift with equal first and second operand. For
978 // ROTR on integer registers (i32/i64) this can be done in a single ror
979 // instruction. A fshl with a non-constant shift uses a neg + ror.
980 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
981 (RetTy->getPrimitiveSizeInBits() == 32 ||
982 RetTy->getPrimitiveSizeInBits() == 64)) {
983 InstructionCost NegCost =
984 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
985 return 1 + NegCost;
986 }
987
988 // TODO: Add handling for fshl where third argument is not a constant.
989 if (!OpInfoZ.isConstant())
990 break;
991
992 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
993 if (OpInfoZ.isUniform()) {
994 static const CostTblEntry FshlTbl[] = {
995 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
996 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
997 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
998 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
999 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
1000 // to avoid having to duplicate the costs.
1001 const auto *Entry =
1002 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
1003 if (Entry)
1004 return LegalisationCost.first * Entry->Cost;
1005 }
1006
1007 auto TyL = getTypeLegalizationCost(RetTy);
1008 if (!RetTy->isIntegerTy())
1009 break;
1010
1011 // Estimate cost manually, as types like i8 and i16 will get promoted to
1012 // i32 and CostTableLookup will ignore the extra conversion cost.
1013 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1014 RetTy->getScalarSizeInBits() < 64) ||
1015 (RetTy->getScalarSizeInBits() % 64 != 0);
1016 unsigned ExtraCost = HigherCost ? 1 : 0;
1017 if (RetTy->getScalarSizeInBits() == 32 ||
1018 RetTy->getScalarSizeInBits() == 64)
1019 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1020 // extr instruction.
1021 else if (HigherCost)
1022 ExtraCost = 1;
1023 else
1024 break;
1025 return TyL.first + ExtraCost;
1026 }
1027 case Intrinsic::get_active_lane_mask: {
1028 auto RetTy = cast<VectorType>(ICA.getReturnType());
1029 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1030 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1031 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1032 break;
1033
1034 if (RetTy->isScalableTy()) {
1035 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1037 break;
1038
1039 auto LT = getTypeLegalizationCost(RetTy);
1040 InstructionCost Cost = LT.first;
1041 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1042 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1043 // nxv32i1 = get_active_lane_mask(base, idx) ->
1044 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1045 if (ST->hasSVE2p1() || ST->hasSME2()) {
1046 Cost /= 2;
1047 if (Cost == 1)
1048 return Cost;
1049 }
1050
1051 // If more than one whilelo intrinsic is required, include the extra cost
1052 // required by the saturating add & select required to increment the
1053 // start value after the first intrinsic call.
1054 Type *OpTy = ICA.getArgTypes()[0];
1055 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1056 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1057 Type *CondTy = OpTy->getWithNewBitWidth(1);
1058 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1060 return Cost + (SplitCost * (Cost - 1));
1061 } else if (!getTLI()->isTypeLegal(RetVT)) {
1062 // We don't have enough context at this point to determine if the mask
1063 // is going to be kept live after the block, which will force the vXi1
1064 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1065 // For now, we just assume the vectorizer created this intrinsic and
1066 // the result will be the input for a PHI. In this case the cost will
1067 // be extremely high for fixed-width vectors.
1068 // NOTE: getScalarizationOverhead returns a cost that's far too
1069 // pessimistic for the actual generated codegen. In reality there are
1070 // two instructions generated per lane.
1071 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1072 }
1073 break;
1074 }
1075 case Intrinsic::experimental_vector_match: {
1076 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1077 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1078 unsigned SearchSize = NeedleTy->getNumElements();
1079 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1080 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1081 // Neoverse V3, these are cheap operations with the same latency as a
1082 // vector ADD. In most cases, however, we also need to do an extra DUP.
1083 // For fixed-length vectors we currently need an extra five--six
1084 // instructions besides the MATCH.
1086 if (isa<FixedVectorType>(RetTy))
1087 Cost += 10;
1088 return Cost;
1089 }
1090 break;
1091 }
1092 case Intrinsic::cttz: {
1093 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1094 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1095 return LT.first * 2;
1096 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1097 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1098 return LT.first * 3;
1099 break;
1100 }
1101 case Intrinsic::experimental_cttz_elts: {
1102 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1103 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1104 // This will consist of a SVE brkb and a cntp instruction. These
1105 // typically have the same latency and half the throughput as a vector
1106 // add instruction.
1107 return 4;
1108 }
1109 break;
1110 }
1111 case Intrinsic::loop_dependence_raw_mask:
1112 case Intrinsic::loop_dependence_war_mask: {
1113 // The whilewr/rw instructions require SVE2 or SME.
1114 if (ST->hasSVE2() || ST->hasSME()) {
1115 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1116 unsigned EltSizeInBytes =
1117 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1118 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1119 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1120 break;
1121 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1122 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1123 }
1124 break;
1125 }
1126 case Intrinsic::experimental_vector_extract_last_active:
1127 if (ST->isSVEorStreamingSVEAvailable()) {
1128 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1129 // This should turn into chained clastb instructions.
1130 return LegalCost;
1131 }
1132 break;
1133 case Intrinsic::pow: {
1134 // For scalar calls we know the target has the libcall, and for fixed-width
1135 // vectors we know for the worst case it can be scalarised.
1136 EVT VT = getTLI()->getValueType(DL, RetTy);
1137 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1138 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1139 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;
1140
1141 // If we know that the call can be lowered with libcalls then it's safe to
1142 // reduce the costs in some cases. This is important for scalable vectors,
1143 // since we cannot scalarize the call in the absence of a vector math
1144 // library.
1145 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1146 // If we know the fast math flags and the exponent is a constant then the
1147 // cost may be less for some exponents like 0.25 and 0.75.
1148 const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
1149 if (ExpC && isa<VectorType>(ExpC->getType()))
1150 ExpC = ExpC->getSplatValue();
1151 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
1152 // The argument must be a FP constant.
1153 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1154 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1155 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1156 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1157 (!Is025 || FMF.noSignedZeros())) {
1158 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1160 if (Is025)
1161 return 2 * Sqrt;
1163 getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
1164 return (Sqrt * 2) + FMul;
1165 }
1166 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1167 // cheaper than pow.
1168 }
1169 }
1170
1171 if (HasLibcall)
1172 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1173 break;
1174 }
1175 case Intrinsic::sqrt:
1176 case Intrinsic::fabs:
1177 case Intrinsic::ceil:
1178 case Intrinsic::floor:
1179 case Intrinsic::nearbyint:
1180 case Intrinsic::round:
1181 case Intrinsic::rint:
1182 case Intrinsic::roundeven:
1183 case Intrinsic::trunc:
1184 case Intrinsic::minnum:
1185 case Intrinsic::maxnum:
1186 case Intrinsic::minimum:
1187 case Intrinsic::maximum: {
1188 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1189 auto LT = getTypeLegalizationCost(RetTy);
1190 return LT.first;
1191 }
1192 break;
1193 }
1194 default:
1195 break;
1196 }
1198}
1199
1200/// The function will remove redundant reinterprets casting in the presence
1201/// of the control flow
1202static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1203 IntrinsicInst &II) {
1205 auto RequiredType = II.getType();
1206
1207 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1208 assert(PN && "Expected Phi Node!");
1209
1210 // Don't create a new Phi unless we can remove the old one.
1211 if (!PN->hasOneUse())
1212 return std::nullopt;
1213
1214 for (Value *IncValPhi : PN->incoming_values()) {
1215 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1216 if (!Reinterpret ||
1217 Reinterpret->getIntrinsicID() !=
1218 Intrinsic::aarch64_sve_convert_to_svbool ||
1219 RequiredType != Reinterpret->getArgOperand(0)->getType())
1220 return std::nullopt;
1221 }
1222
1223 // Create the new Phi
1224 IC.Builder.SetInsertPoint(PN);
1225 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1226 Worklist.push_back(PN);
1227
1228 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1229 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1230 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1231 Worklist.push_back(Reinterpret);
1232 }
1233
1234 // Cleanup Phi Node and reinterprets
1235 return IC.replaceInstUsesWith(II, NPN);
1236}
1237
1238// A collection of properties common to SVE intrinsics that allow for combines
1239// to be written without needing to know the specific intrinsic.
//
// Each visible setter body follows the same shape: assert that the property is
// still unset, store the value, and return *this so that calls can be chained.
// NOTE(review): every line of this listing carries a fused line number, and the
// numbering skips wherever a line is absent (the opening of the type and most
// member signatures among them). The comments added below describe only what
// the visible lines establish.
1241 //
1242 // Helper routines for common intrinsic definitions.
1243 //
1244
1245 // e.g. llvm.aarch64.sve.add pg, op1, op2
1246 // with IID ==> llvm.aarch64.sve.add_u
1247 static SVEIntrinsicInfo
  // NOTE(review): the remainder of this helper (numbered 1248-1253) is not
  // visible in this listing.
1254
1255 // e.g. llvm.aarch64.sve.neg inactive, pg, op
  // NOTE(review): the helper this example documents (1256-1261) is not visible.
1262
1263 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
  // NOTE(review): the helper this example documents (1264-1268) is not visible.
1269
1270 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
  // NOTE(review): the helper this example documents (1271-1275) is not visible.
1276
1277 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1278 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
  // Starts from a default-constructed SVEIntrinsicInfo. The chained calls that
  // follow (numbered 1281-1282) are not visible in this listing; judging by the
  // examples above, GPIndex is presumably the operand position of the governing
  // predicate -- TODO confirm.
1279 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1280 return SVEIntrinsicInfo()
1283 }
1284
1285 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1286 // llvm.aarch64.sve.ld1 pg, ptr
  // NOTE(review): the helper this example documents (1287-1292) is not visible.
1293
1294 // All properties relate to predication and thus having a general predicate
1295 // is the minimum requirement to say there is intrinsic info to act on.
1296 explicit operator bool() const { return hasGoverningPredicate(); }
1297
1298 //
1299 // Properties relating to the governing predicate.
1300 //
1301
  // std::numeric_limits<unsigned>::max() is the "unset" sentinel for
  // GoverningPredicateIdx (it is also the field's default initializer).
1303 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1304 }
1305
  // Read accessor: asserts the index has been set, then returns it.
1307 assert(hasGoverningPredicate() && "Propery not set!");
1308 return GoverningPredicateIdx;
1309 }
1310
  // Write-once setter: asserts the index is still unset, stores Index and
  // returns *this.
1312 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1313 GoverningPredicateIdx = Index;
1314 return *this;
1315 }
1316
1317 //
1318 // Properties relating to operations the intrinsic could be transformed into.
1319 // NOTE: This does not mean such a transformation is always possible, but the
1320 // knowledge makes it possible to reuse existing optimisations without needing
1321 // to embed specific handling for each intrinsic. For example, instruction
1322 // simplification can be used to optimise an intrinsic's active lanes.
1323 //
1324
  // Intrinsic::not_intrinsic is the "unset" sentinel for UndefIntrinsic.
1326 return UndefIntrinsic != Intrinsic::not_intrinsic;
1327 }
1328
  // Read accessor: asserts an undef-variant intrinsic was recorded, then
  // returns its ID.
1330 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1331 return UndefIntrinsic;
1332 }
1333
  // Write-once setter: asserts no undef-variant intrinsic is recorded yet,
  // stores IID and returns *this.
1335 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1336 UndefIntrinsic = IID;
1337 return *this;
1338 }
1339
  // An IROpcode of 0 means no matching IR opcode has been recorded.
1340 bool hasMatchingIROpode() const { return IROpcode != 0; }
1341
  // Returns the recorded IR opcode; asserts that one has been set.
1342 unsigned getMatchingIROpode() const {
1343 assert(hasMatchingIROpode() && "Propery not set!");
1344 return IROpcode;
1345 }
1346
  // Write-once setter: asserts no opcode is recorded yet, stores Opcode and
  // returns *this.
1348 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1349 IROpcode = Opcode;
1350 return *this;
1351 }
1352
1353 //
1354 // Properties relating to the result of inactive lanes.
1355 //
  // ResultLanes holds at most one of the three styles below: each of the three
  // setters asserts that ResultLanes is still Uninitialized before assigning.
1356
1358 return ResultLanes == InactiveLanesTakenFromOperand;
1359 }
1360
  // Returns the operand index recorded for inactive lanes; only valid when the
  // style is InactiveLanesTakenFromOperand (asserted).
1362 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1363 return OperandIdxForInactiveLanes;
1364 }
1365
  // Selects the InactiveLanesTakenFromOperand style and records which operand
  // supplies the inactive lanes.
1367 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1368 ResultLanes = InactiveLanesTakenFromOperand;
1369 OperandIdxForInactiveLanes = Index;
1370 return *this;
1371 }
1372
1374 return ResultLanes == InactiveLanesAreNotDefined;
1375 }
1376
  // Selects the InactiveLanesAreNotDefined style.
1378 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1379 ResultLanes = InactiveLanesAreNotDefined;
1380 return *this;
1381 }
1382
1384 return ResultLanes == InactiveLanesAreUnused;
1385 }
1386
  // Selects the InactiveLanesAreUnused style.
1388 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1389 ResultLanes = InactiveLanesAreUnused;
1390 return *this;
1391 }
1392
1393 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1394 // inactiveLanesAreZeroed =
1395 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1396 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1397
  // Unlike the other setters this one has no "set twice" assertion; it simply
  // sets the flag to true and returns *this.
1399 ResultIsZeroInitialized = true;
1400 return *this;
1401 }
1402
1403 //
1404 // The first operand of unary merging operations is typically only used to
1405 // set the result for inactive lanes. Knowing this allows us to deadcode the
1406 // operand when we can prove there are no inactive lanes.
1407 //
1408
  // std::numeric_limits<unsigned>::max() is the "unset" sentinel for
  // OperandIdxWithNoActiveLanes.
1410 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1411 }
1412
  // Read accessor: asserts the index has been set, then returns it.
1414 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1415 return OperandIdxWithNoActiveLanes;
1416 }
1417
  // Write-once setter: asserts the index is still unset, stores Index and
  // returns *this.
1419 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1420 OperandIdxWithNoActiveLanes = Index;
1421 return *this;
1422 }
1423
1424private:
  // Operand index of the governing predicate; max() means "not set".
1425 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1426
  // Intrinsic recorded as the undef variant (not_intrinsic means "not set") and
  // the matching IR opcode (0 means "not set").
1427 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1428 unsigned IROpcode = 0;
1429
  // How the result's inactive lanes are described.
  // NOTE(review): the first enumerator line (1431) is not visible in this
  // listing; `Uninitialized` is used below as the initial value.
1430 enum PredicationStyle {
1432 InactiveLanesTakenFromOperand,
1433 InactiveLanesAreNotDefined,
1434 InactiveLanesAreUnused
1435 } ResultLanes = Uninitialized;
1436
  // Zero-initialization flag, plus the two optional operand indices; max()
  // is the default initializer for both indices.
1437 bool ResultIsZeroInitialized = false;
1438 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1439 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1440};
1441
// Maps the intrinsic called by `II` to the SVEIntrinsicInfo describing it.
// A default-constructed SVEIntrinsicInfo is returned when neither the result
// nor any argument of `II` is a scalable vector, and when the intrinsic ID is
// not matched by any case of the switch below.
// NOTE(review): the declaration line of this function (numbered 1442) and the
// statements that terminate several groups of case labels are not visible in
// this listing; those places are marked below.
1443 // Some SVE intrinsics do not use scalable vector types, but since they are
1444 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1445 if (!isa<ScalableVectorType>(II.getType()) &&
1446 all_of(II.args(), [&](const Value *V) {
1447 return !isa<ScalableVectorType>(V->getType());
1448 }))
1449 return SVEIntrinsicInfo();
1450
1451 Intrinsic::ID IID = II.getIntrinsicID();
1452 switch (IID) {
1453 default:
1454 break;
 // Conversion and element-reversal intrinsics share one result.
 // NOTE(review): the statement ending this group (1489) is not visible.
1455 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1456 case Intrinsic::aarch64_sve_fcvt_f16f32:
1457 case Intrinsic::aarch64_sve_fcvt_f16f64:
1458 case Intrinsic::aarch64_sve_fcvt_f32f16:
1459 case Intrinsic::aarch64_sve_fcvt_f32f64:
1460 case Intrinsic::aarch64_sve_fcvt_f64f16:
1461 case Intrinsic::aarch64_sve_fcvt_f64f32:
1462 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1463 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1464 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1465 case Intrinsic::aarch64_sve_fcvtzs:
1466 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1467 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1468 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1469 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1470 case Intrinsic::aarch64_sve_fcvtzu:
1471 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1472 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1473 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1474 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1475 case Intrinsic::aarch64_sve_revb:
1476 case Intrinsic::aarch64_sve_revh:
1477 case Intrinsic::aarch64_sve_revw:
1478 case Intrinsic::aarch64_sve_revd:
1479 case Intrinsic::aarch64_sve_scvtf:
1480 case Intrinsic::aarch64_sve_scvtf_f16i32:
1481 case Intrinsic::aarch64_sve_scvtf_f16i64:
1482 case Intrinsic::aarch64_sve_scvtf_f32i64:
1483 case Intrinsic::aarch64_sve_scvtf_f64i32:
1484 case Intrinsic::aarch64_sve_ucvtf:
1485 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1486 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1487 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1488 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1490
 // NOTE(review): the statement ending this group (1495) is not visible.
1491 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1492 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1493 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1494 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1496
 // Each intrinsic below is described by defaultMergingOp() together with its
 // `_u` counterpart; where a setMatchingIROpcode() call is chained on, the
 // corresponding plain IR opcode is recorded as well.
1497 case Intrinsic::aarch64_sve_fabd:
1498 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1499 case Intrinsic::aarch64_sve_fadd:
1500 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1501 .setMatchingIROpcode(Instruction::FAdd);
1502 case Intrinsic::aarch64_sve_fdiv:
1503 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1504 .setMatchingIROpcode(Instruction::FDiv);
1505 case Intrinsic::aarch64_sve_fmax:
1506 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1507 case Intrinsic::aarch64_sve_fmaxnm:
1508 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1509 case Intrinsic::aarch64_sve_fmin:
1510 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1511 case Intrinsic::aarch64_sve_fminnm:
1512 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1513 case Intrinsic::aarch64_sve_fmla:
1514 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1515 case Intrinsic::aarch64_sve_fmls:
1516 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1517 case Intrinsic::aarch64_sve_fmul:
1518 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1519 .setMatchingIROpcode(Instruction::FMul);
1520 case Intrinsic::aarch64_sve_fmulx:
1521 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1522 case Intrinsic::aarch64_sve_fnmla:
1523 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1524 case Intrinsic::aarch64_sve_fnmls:
1525 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1526 case Intrinsic::aarch64_sve_fsub:
1527 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1528 .setMatchingIROpcode(Instruction::FSub);
1529 case Intrinsic::aarch64_sve_add:
1530 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1531 .setMatchingIROpcode(Instruction::Add);
1532 case Intrinsic::aarch64_sve_mla:
1533 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1534 case Intrinsic::aarch64_sve_mls:
1535 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1536 case Intrinsic::aarch64_sve_mul:
1537 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1538 .setMatchingIROpcode(Instruction::Mul);
1539 case Intrinsic::aarch64_sve_sabd:
1540 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1541 case Intrinsic::aarch64_sve_sdiv:
1542 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1543 .setMatchingIROpcode(Instruction::SDiv);
1544 case Intrinsic::aarch64_sve_smax:
1545 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1546 case Intrinsic::aarch64_sve_smin:
1547 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1548 case Intrinsic::aarch64_sve_smulh:
1549 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1550 case Intrinsic::aarch64_sve_sub:
1551 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1552 .setMatchingIROpcode(Instruction::Sub);
1553 case Intrinsic::aarch64_sve_uabd:
1554 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1555 case Intrinsic::aarch64_sve_udiv:
1556 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1557 .setMatchingIROpcode(Instruction::UDiv);
1558 case Intrinsic::aarch64_sve_umax:
1559 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1560 case Intrinsic::aarch64_sve_umin:
1561 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1562 case Intrinsic::aarch64_sve_umulh:
1563 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1564 case Intrinsic::aarch64_sve_asr:
1565 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1566 .setMatchingIROpcode(Instruction::AShr);
1567 case Intrinsic::aarch64_sve_lsl:
1568 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1569 .setMatchingIROpcode(Instruction::Shl);
1570 case Intrinsic::aarch64_sve_lsr:
1571 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1572 .setMatchingIROpcode(Instruction::LShr);
1573 case Intrinsic::aarch64_sve_and:
1574 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1575 .setMatchingIROpcode(Instruction::And);
1576 case Intrinsic::aarch64_sve_bic:
1577 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1578 case Intrinsic::aarch64_sve_eor:
1579 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1580 .setMatchingIROpcode(Instruction::Xor);
1581 case Intrinsic::aarch64_sve_orr:
1582 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1583 .setMatchingIROpcode(Instruction::Or);
1584 case Intrinsic::aarch64_sve_shsub:
1585 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
 // NOTE(review): the statement for this case (1587) is not visible.
1586 case Intrinsic::aarch64_sve_shsubr:
1588 case Intrinsic::aarch64_sve_sqrshl:
1589 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1590 case Intrinsic::aarch64_sve_sqshl:
1591 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1592 case Intrinsic::aarch64_sve_sqsub:
1593 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1594 case Intrinsic::aarch64_sve_srshl:
1595 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1596 case Intrinsic::aarch64_sve_uhsub:
1597 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
 // NOTE(review): the statement for this case (1599) is not visible.
1598 case Intrinsic::aarch64_sve_uhsubr:
1600 case Intrinsic::aarch64_sve_uqrshl:
1601 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1602 case Intrinsic::aarch64_sve_uqshl:
1603 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1604 case Intrinsic::aarch64_sve_uqsub:
1605 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1606 case Intrinsic::aarch64_sve_urshl:
1607 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1608
 // `_u` forms. NOTE(review): for every case in this group the first line of
 // its statement is not visible; only the trailing Instruction opcode
 // argument of each statement remains in this listing.
1609 case Intrinsic::aarch64_sve_add_u:
1611 Instruction::Add);
1612 case Intrinsic::aarch64_sve_and_u:
1614 Instruction::And);
1615 case Intrinsic::aarch64_sve_asr_u:
1617 Instruction::AShr);
1618 case Intrinsic::aarch64_sve_eor_u:
1620 Instruction::Xor);
1621 case Intrinsic::aarch64_sve_fadd_u:
1623 Instruction::FAdd);
1624 case Intrinsic::aarch64_sve_fdiv_u:
1626 Instruction::FDiv);
1627 case Intrinsic::aarch64_sve_fmul_u:
1629 Instruction::FMul);
1630 case Intrinsic::aarch64_sve_fsub_u:
1632 Instruction::FSub);
1633 case Intrinsic::aarch64_sve_lsl_u:
1635 Instruction::Shl);
1636 case Intrinsic::aarch64_sve_lsr_u:
1638 Instruction::LShr);
1639 case Intrinsic::aarch64_sve_mul_u:
1641 Instruction::Mul);
1642 case Intrinsic::aarch64_sve_orr_u:
1644 Instruction::Or);
1645 case Intrinsic::aarch64_sve_sdiv_u:
1647 Instruction::SDiv);
1648 case Intrinsic::aarch64_sve_sub_u:
1650 Instruction::Sub);
1651 case Intrinsic::aarch64_sve_udiv_u:
1653 Instruction::UDiv);
1654
 // Predicate operations, reductions, compares and loads share one result.
 // NOTE(review): the statement ending this group (1738) is not visible.
1655 case Intrinsic::aarch64_sve_addqv:
1656 case Intrinsic::aarch64_sve_and_z:
1657 case Intrinsic::aarch64_sve_bic_z:
1658 case Intrinsic::aarch64_sve_brka_z:
1659 case Intrinsic::aarch64_sve_brkb_z:
1660 case Intrinsic::aarch64_sve_brkn_z:
1661 case Intrinsic::aarch64_sve_brkpa_z:
1662 case Intrinsic::aarch64_sve_brkpb_z:
1663 case Intrinsic::aarch64_sve_cntp:
1664 case Intrinsic::aarch64_sve_compact:
1665 case Intrinsic::aarch64_sve_eor_z:
1666 case Intrinsic::aarch64_sve_eorv:
1667 case Intrinsic::aarch64_sve_eorqv:
1668 case Intrinsic::aarch64_sve_nand_z:
1669 case Intrinsic::aarch64_sve_nor_z:
1670 case Intrinsic::aarch64_sve_orn_z:
1671 case Intrinsic::aarch64_sve_orr_z:
1672 case Intrinsic::aarch64_sve_orv:
1673 case Intrinsic::aarch64_sve_orqv:
1674 case Intrinsic::aarch64_sve_pnext:
1675 case Intrinsic::aarch64_sve_rdffr_z:
1676 case Intrinsic::aarch64_sve_saddv:
1677 case Intrinsic::aarch64_sve_uaddv:
1678 case Intrinsic::aarch64_sve_umaxv:
1679 case Intrinsic::aarch64_sve_umaxqv:
1680 case Intrinsic::aarch64_sve_cmpeq:
1681 case Intrinsic::aarch64_sve_cmpeq_wide:
1682 case Intrinsic::aarch64_sve_cmpge:
1683 case Intrinsic::aarch64_sve_cmpge_wide:
1684 case Intrinsic::aarch64_sve_cmpgt:
1685 case Intrinsic::aarch64_sve_cmpgt_wide:
1686 case Intrinsic::aarch64_sve_cmphi:
1687 case Intrinsic::aarch64_sve_cmphi_wide:
1688 case Intrinsic::aarch64_sve_cmphs:
1689 case Intrinsic::aarch64_sve_cmphs_wide:
1690 case Intrinsic::aarch64_sve_cmple_wide:
1691 case Intrinsic::aarch64_sve_cmplo_wide:
1692 case Intrinsic::aarch64_sve_cmpls_wide:
1693 case Intrinsic::aarch64_sve_cmplt_wide:
1694 case Intrinsic::aarch64_sve_cmpne:
1695 case Intrinsic::aarch64_sve_cmpne_wide:
1696 case Intrinsic::aarch64_sve_facge:
1697 case Intrinsic::aarch64_sve_facgt:
1698 case Intrinsic::aarch64_sve_fcmpeq:
1699 case Intrinsic::aarch64_sve_fcmpge:
1700 case Intrinsic::aarch64_sve_fcmpgt:
1701 case Intrinsic::aarch64_sve_fcmpne:
1702 case Intrinsic::aarch64_sve_fcmpuo:
1703 case Intrinsic::aarch64_sve_ld1:
1704 case Intrinsic::aarch64_sve_ld1_gather:
1705 case Intrinsic::aarch64_sve_ld1_gather_index:
1706 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1707 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1708 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1709 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1710 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1711 case Intrinsic::aarch64_sve_ld1q_gather_index:
1712 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1713 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1714 case Intrinsic::aarch64_sve_ld1ro:
1715 case Intrinsic::aarch64_sve_ld1rq:
1716 case Intrinsic::aarch64_sve_ld1udq:
1717 case Intrinsic::aarch64_sve_ld1uwq:
1718 case Intrinsic::aarch64_sve_ld2_sret:
1719 case Intrinsic::aarch64_sve_ld2q_sret:
1720 case Intrinsic::aarch64_sve_ld3_sret:
1721 case Intrinsic::aarch64_sve_ld3q_sret:
1722 case Intrinsic::aarch64_sve_ld4_sret:
1723 case Intrinsic::aarch64_sve_ld4q_sret:
1724 case Intrinsic::aarch64_sve_ldff1:
1725 case Intrinsic::aarch64_sve_ldff1_gather:
1726 case Intrinsic::aarch64_sve_ldff1_gather_index:
1727 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1728 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1729 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1730 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1731 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1732 case Intrinsic::aarch64_sve_ldnf1:
1733 case Intrinsic::aarch64_sve_ldnt1:
1734 case Intrinsic::aarch64_sve_ldnt1_gather:
1735 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1736 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1737 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1739
 // Prefetches.
 // NOTE(review): the statement ending this group (1757) is not visible.
1740 case Intrinsic::aarch64_sve_prf:
1741 case Intrinsic::aarch64_sve_prfb_gather_index:
1742 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1743 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1744 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1745 case Intrinsic::aarch64_sve_prfd_gather_index:
1746 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1747 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1748 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1749 case Intrinsic::aarch64_sve_prfh_gather_index:
1750 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1751 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1752 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1753 case Intrinsic::aarch64_sve_prfw_gather_index:
1754 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1755 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1756 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1758
 // Stores.
 // NOTE(review): the statements ending each of the four groups below (1775,
 // 1778, 1781 and 1784) are not visible.
1759 case Intrinsic::aarch64_sve_st1_scatter:
1760 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1761 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1762 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1763 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1764 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1765 case Intrinsic::aarch64_sve_st1dq:
1766 case Intrinsic::aarch64_sve_st1q_scatter_index:
1767 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1768 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1769 case Intrinsic::aarch64_sve_st1wq:
1770 case Intrinsic::aarch64_sve_stnt1:
1771 case Intrinsic::aarch64_sve_stnt1_scatter:
1772 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1773 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1774 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1776 case Intrinsic::aarch64_sve_st2:
1777 case Intrinsic::aarch64_sve_st2q:
1779 case Intrinsic::aarch64_sve_st3:
1780 case Intrinsic::aarch64_sve_st3q:
1782 case Intrinsic::aarch64_sve_st4:
1783 case Intrinsic::aarch64_sve_st4q:
1785 }
1786
 // Reached via `default: break;`: no information is recorded for this
 // intrinsic, so an empty SVEIntrinsicInfo is returned.
1787 return SVEIntrinsicInfo();
1788}
1789
1790static bool isAllActivePredicate(Value *Pred) {
1791 Value *UncastedPred;
1792
1793 // Look through predicate casts that only remove lanes.
1795 m_Value(UncastedPred)))) {
1796 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1797 Pred = UncastedPred;
1798
1800 m_Value(UncastedPred))))
1801 // If the predicate has the same or less lanes than the uncasted predicate
1802 // then we know the casting has no effect.
1803 if (OrigPredTy->getMinNumElements() <=
1804 cast<ScalableVectorType>(UncastedPred->getType())
1805 ->getMinNumElements())
1806 Pred = UncastedPred;
1807 }
1808
1809 auto *C = dyn_cast<Constant>(Pred);
1810 return C && C->isAllOnesValue();
1811}
1812
1813// Simplify `V` by only considering the operations that affect active lanes.
1814// This function should only return existing Values or newly created Constants.
1815static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1816 auto *Dup = dyn_cast<IntrinsicInst>(V);
1817 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1818 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1820 cast<VectorType>(V->getType())->getElementCount(),
1821 cast<Constant>(Dup->getOperand(2)));
1822
1823 return V;
1824}
1825
1826static std::optional<Instruction *>
1828 const SVEIntrinsicInfo &IInfo) {
1829 const unsigned Opc = IInfo.getMatchingIROpode();
1830 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1831
1832 Value *Pg = II.getOperand(0);
1833 Value *Op1 = II.getOperand(1);
1834 Value *Op2 = II.getOperand(2);
1835 const DataLayout &DL = II.getDataLayout();
1836
1837 // Canonicalise constants to the RHS.
1839 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1840 IC.replaceOperand(II, 1, Op2);
1841 IC.replaceOperand(II, 2, Op1);
1842 return &II;
1843 }
1844
1845 // Only active lanes matter when simplifying the operation.
1846 Op1 = stripInactiveLanes(Op1, Pg);
1847 Op2 = stripInactiveLanes(Op2, Pg);
1848
1849 Value *SimpleII;
1850 if (auto FII = dyn_cast<FPMathOperator>(&II))
1851 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1852 else
1853 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1854
1855 // An SVE intrinsic's result is always defined. However, this is not the case
1856 // for its equivalent IR instruction (e.g. when shifting by an amount more
1857 // than the data's bitwidth). Simplifications to an undefined result must be
1858 // ignored to preserve the intrinsic's expected behaviour.
1859 if (!SimpleII || isa<UndefValue>(SimpleII))
1860 return std::nullopt;
1861
1862 if (IInfo.inactiveLanesAreNotDefined())
1863 return IC.replaceInstUsesWith(II, SimpleII);
1864
1865 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1866
1867 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1868 if (SimpleII == Inactive)
1869 return IC.replaceInstUsesWith(II, SimpleII);
1870
1871 // Inactive lanes must be preserved.
1872 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1873 return IC.replaceInstUsesWith(II, SimpleII);
1874}
1875
1876// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1877// to operations with less strict inactive lane requirements.
1878static std::optional<Instruction *>
1880 const SVEIntrinsicInfo &IInfo) {
1881 if (!IInfo.hasGoverningPredicate())
1882 return std::nullopt;
1883
1884 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1885
1886 // If there are no active lanes.
1887 if (match(OpPredicate, m_ZeroInt())) {
1889 return IC.replaceInstUsesWith(
1890 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1891
1892 if (IInfo.inactiveLanesAreUnused()) {
1893 if (IInfo.resultIsZeroInitialized())
1895
1896 return IC.eraseInstFromFunction(II);
1897 }
1898 }
1899
1900 // If there are no inactive lanes.
1901 if (isAllActivePredicate(OpPredicate)) {
1902 if (IInfo.hasOperandWithNoActiveLanes()) {
1903 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1904 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1905 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1906 }
1907
1908 if (IInfo.hasMatchingUndefIntrinsic()) {
1909 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1910 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1911 II.setCalledFunction(NewDecl);
1912 return &II;
1913 }
1914 }
1915
1916 // Operation specific simplifications.
1917 if (IInfo.hasMatchingIROpode() &&
1919 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1920
1921 return std::nullopt;
1922}
1923
1924// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1925// => (binop (pred) (from_svbool _) (from_svbool _))
1926//
1927// The above transformation eliminates a `to_svbool` in the predicate
1928// operand of bitwise operation `binop` by narrowing the vector width of
1929// the operation. For example, it would convert a `<vscale x 16 x i1>
1930// and` into a `<vscale x 4 x i1> and`. This is profitable because
1931// to_svbool must zero the new lanes during widening, whereas
1932// from_svbool is free.
1933static std::optional<Instruction *>
1935 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1936 if (!BinOp)
1937 return std::nullopt;
1938
1939 auto IntrinsicID = BinOp->getIntrinsicID();
1940 switch (IntrinsicID) {
1941 case Intrinsic::aarch64_sve_and_z:
1942 case Intrinsic::aarch64_sve_bic_z:
1943 case Intrinsic::aarch64_sve_eor_z:
1944 case Intrinsic::aarch64_sve_nand_z:
1945 case Intrinsic::aarch64_sve_nor_z:
1946 case Intrinsic::aarch64_sve_orn_z:
1947 case Intrinsic::aarch64_sve_orr_z:
1948 break;
1949 default:
1950 return std::nullopt;
1951 }
1952
1953 auto BinOpPred = BinOp->getOperand(0);
1954 auto BinOpOp1 = BinOp->getOperand(1);
1955 auto BinOpOp2 = BinOp->getOperand(2);
1956
1957 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1958 if (!PredIntr ||
1959 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1960 return std::nullopt;
1961
1962 auto PredOp = PredIntr->getOperand(0);
1963 auto PredOpTy = cast<VectorType>(PredOp->getType());
1964 if (PredOpTy != II.getType())
1965 return std::nullopt;
1966
1967 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1968 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1969 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1970 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1971 if (BinOpOp1 == BinOpOp2)
1972 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1973 else
1974 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1975 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1976
1977 auto NarrowedBinOp =
1978 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1979 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1980}
1981
1982static std::optional<Instruction *>
1984 // If the reinterpret instruction operand is a PHI Node
1985 if (isa<PHINode>(II.getArgOperand(0)))
1986 return processPhiNode(IC, II);
1987
1988 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1989 return BinOpCombine;
1990
1991 // Ignore converts to/from svcount_t.
1992 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1993 isa<TargetExtType>(II.getType()))
1994 return std::nullopt;
1995
1996 SmallVector<Instruction *, 32> CandidatesForRemoval;
1997 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1998
1999 const auto *IVTy = cast<VectorType>(II.getType());
2000
2001 // Walk the chain of conversions.
2002 while (Cursor) {
2003 // If the type of the cursor has fewer lanes than the final result, zeroing
2004 // must take place, which breaks the equivalence chain.
2005 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
2006 if (CursorVTy->getElementCount().getKnownMinValue() <
2007 IVTy->getElementCount().getKnownMinValue())
2008 break;
2009
2010 // If the cursor has the same type as I, it is a viable replacement.
2011 if (Cursor->getType() == IVTy)
2012 EarliestReplacement = Cursor;
2013
2014 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
2015
2016 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2017 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2018 Intrinsic::aarch64_sve_convert_to_svbool ||
2019 IntrinsicCursor->getIntrinsicID() ==
2020 Intrinsic::aarch64_sve_convert_from_svbool))
2021 break;
2022
2023 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
2024 Cursor = IntrinsicCursor->getOperand(0);
2025 }
2026
2027 // If no viable replacement in the conversion chain was found, there is
2028 // nothing to do.
2029 if (!EarliestReplacement)
2030 return std::nullopt;
2031
2032 return IC.replaceInstUsesWith(II, EarliestReplacement);
2033}
2034
2035static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2036 IntrinsicInst &II) {
2037 // svsel(ptrue, x, y) => x
2038 auto *OpPredicate = II.getOperand(0);
2039 if (isAllActivePredicate(OpPredicate))
2040 return IC.replaceInstUsesWith(II, II.getOperand(1));
2041
2042 auto Select =
2043 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2044 return IC.replaceInstUsesWith(II, Select);
2045}
2046
2047static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2048 IntrinsicInst &II) {
2049 Value *Pg = II.getOperand(1);
2050
2051 // sve.dup(V, all_active, X) ==> splat(X)
2052 if (isAllActivePredicate(Pg)) {
2053 auto *RetTy = cast<ScalableVectorType>(II.getType());
2054 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2055 II.getArgOperand(2));
2056 return IC.replaceInstUsesWith(II, Splat);
2057 }
2058
2060 m_SpecificInt(AArch64SVEPredPattern::vl1))))
2061 return std::nullopt;
2062
2063 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2064 Value *Insert = IC.Builder.CreateInsertElement(
2065 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
2066 return IC.replaceInstUsesWith(II, Insert);
2067}
2068
2069static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2070 IntrinsicInst &II) {
2071 // Replace DupX with a regular IR splat.
2072 auto *RetTy = cast<ScalableVectorType>(II.getType());
2073 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2074 II.getArgOperand(0));
2075 Splat->takeName(&II);
2076 return IC.replaceInstUsesWith(II, Splat);
2077}
2078
2079static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2080 IntrinsicInst &II) {
2081 LLVMContext &Ctx = II.getContext();
2082
2083 if (!isAllActivePredicate(II.getArgOperand(0)))
2084 return std::nullopt;
2085
2086 // Check that we have a compare of zero..
2087 auto *SplatValue =
2089 if (!SplatValue || !SplatValue->isZero())
2090 return std::nullopt;
2091
2092 // ..against a dupq
2093 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2094 if (!DupQLane ||
2095 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2096 return std::nullopt;
2097
2098 // Where the dupq is a lane 0 replicate of a vector insert
2099 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2100 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2101 return std::nullopt;
2102
2103 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2104 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2105 return std::nullopt;
2106
2107 // Where the vector insert is a fixed constant vector insert into undef at
2108 // index zero
2109 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2110 return std::nullopt;
2111
2112 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2113 return std::nullopt;
2114
2115 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2116 if (!ConstVec)
2117 return std::nullopt;
2118
2119 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2120 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2121 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2122 return std::nullopt;
2123
2124 unsigned NumElts = VecTy->getNumElements();
2125 unsigned PredicateBits = 0;
2126
2127 // Expand intrinsic operands to a 16-bit byte level predicate
2128 for (unsigned I = 0; I < NumElts; ++I) {
2129 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2130 if (!Arg)
2131 return std::nullopt;
2132 if (!Arg->isZero())
2133 PredicateBits |= 1 << (I * (16 / NumElts));
2134 }
2135
2136 // If all bits are zero bail early with an empty predicate
2137 if (PredicateBits == 0) {
2138 auto *PFalse = Constant::getNullValue(II.getType());
2139 PFalse->takeName(&II);
2140 return IC.replaceInstUsesWith(II, PFalse);
2141 }
2142
2143 // Calculate largest predicate type used (where byte predicate is largest)
2144 unsigned Mask = 8;
2145 for (unsigned I = 0; I < 16; ++I)
2146 if ((PredicateBits & (1 << I)) != 0)
2147 Mask |= (I % 8);
2148
2149 unsigned PredSize = Mask & -Mask;
2150 auto *PredType = ScalableVectorType::get(
2151 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2152
2153 // Ensure all relevant bits are set
2154 for (unsigned I = 0; I < 16; I += PredSize)
2155 if ((PredicateBits & (1 << I)) == 0)
2156 return std::nullopt;
2157
2158 auto *ConvertToSVBool =
2159 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
2160 PredType, ConstantInt::getTrue(PredType));
2161 auto *ConvertFromSVBool =
2162 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2163 II.getType(), ConvertToSVBool);
2164
2165 ConvertFromSVBool->takeName(&II);
2166 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2167}
2168
2169static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2170 IntrinsicInst &II) {
2171 Value *Pg = II.getArgOperand(0);
2172 Value *Vec = II.getArgOperand(1);
2173 auto IntrinsicID = II.getIntrinsicID();
2174 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2175
2176 // lastX(splat(X)) --> X
2177 if (auto *SplatVal = getSplatValue(Vec))
2178 return IC.replaceInstUsesWith(II, SplatVal);
2179
2180 // If x and/or y is a splat value then:
2181 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2182 Value *LHS, *RHS;
2183 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2184 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2185 auto *OldBinOp = cast<BinaryOperator>(Vec);
2186 auto OpC = OldBinOp->getOpcode();
2187 auto *NewLHS =
2188 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2189 auto *NewRHS =
2190 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2192 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2193 return IC.replaceInstUsesWith(II, NewBinOp);
2194 }
2195 }
2196
2197 auto *C = dyn_cast<Constant>(Pg);
2198 if (IsAfter && C && C->isNullValue()) {
2199 // The intrinsic is extracting lane 0 so use an extract instead.
2200 auto *IdxTy = Type::getInt64Ty(II.getContext());
2201 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2202 Extract->insertBefore(II.getIterator());
2203 Extract->takeName(&II);
2204 return IC.replaceInstUsesWith(II, Extract);
2205 }
2206
2207 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2208 if (!IntrPG)
2209 return std::nullopt;
2210
2211 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2212 return std::nullopt;
2213
2214 const auto PTruePattern =
2215 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2216
2217 // Can the intrinsic's predicate be converted to a known constant index?
2218 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2219 if (!MinNumElts)
2220 return std::nullopt;
2221
2222 unsigned Idx = MinNumElts - 1;
2223 // Increment the index if extracting the element after the last active
2224 // predicate element.
2225 if (IsAfter)
2226 ++Idx;
2227
2228 // Ignore extracts whose index is larger than the known minimum vector
2229 // length. NOTE: This is an artificial constraint where we prefer to
2230 // maintain what the user asked for until an alternative is proven faster.
2231 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2232 if (Idx >= PgVTy->getMinNumElements())
2233 return std::nullopt;
2234
2235 // The intrinsic is extracting a fixed lane so use an extract instead.
2236 auto *IdxTy = Type::getInt64Ty(II.getContext());
2237 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2238 Extract->insertBefore(II.getIterator());
2239 Extract->takeName(&II);
2240 return IC.replaceInstUsesWith(II, Extract);
2241}
2242
2243static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2244 IntrinsicInst &II) {
2245 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2246 // integer variant across a variety of micro-architectures. Replace scalar
2247 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2248 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2249 // depending on the micro-architecture, but has been observed as generally
2250 // being faster, particularly when the CLAST[AB] op is a loop-carried
2251 // dependency.
2252 Value *Pg = II.getArgOperand(0);
2253 Value *Fallback = II.getArgOperand(1);
2254 Value *Vec = II.getArgOperand(2);
2255 Type *Ty = II.getType();
2256
2257 if (!Ty->isIntegerTy())
2258 return std::nullopt;
2259
2260 Type *FPTy;
2261 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2262 default:
2263 return std::nullopt;
2264 case 16:
2265 FPTy = IC.Builder.getHalfTy();
2266 break;
2267 case 32:
2268 FPTy = IC.Builder.getFloatTy();
2269 break;
2270 case 64:
2271 FPTy = IC.Builder.getDoubleTy();
2272 break;
2273 }
2274
2275 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2276 auto *FPVTy = VectorType::get(
2277 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2278 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2279 auto *FPII = IC.Builder.CreateIntrinsic(
2280 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2281 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2282 return IC.replaceInstUsesWith(II, FPIItoInt);
2283}
2284
2285static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2286 IntrinsicInst &II) {
2287 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2288 // can work with RDFFR_PP for ptest elimination.
2289 auto *RDFFR = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z,
2290 ConstantInt::getTrue(II.getType()));
2291 RDFFR->takeName(&II);
2292 return IC.replaceInstUsesWith(II, RDFFR);
2293}
2294
2295static std::optional<Instruction *>
2297 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2298
2299 if (Pattern == AArch64SVEPredPattern::all) {
2301 II.getType(), ElementCount::getScalable(NumElts));
2302 Cnt->takeName(&II);
2303 return IC.replaceInstUsesWith(II, Cnt);
2304 }
2305
2306 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2307
2308 return MinNumElts && NumElts >= MinNumElts
2309 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2310 II, ConstantInt::get(II.getType(), MinNumElts)))
2311 : std::nullopt;
2312}
2313
2314static std::optional<Instruction *>
2316 const AArch64Subtarget *ST) {
2317 if (!ST->isStreaming())
2318 return std::nullopt;
2319
2320 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2321 // with SVEPredPattern::all
2322 Value *Cnt =
2324 Cnt->takeName(&II);
2325 return IC.replaceInstUsesWith(II, Cnt);
2326}
2327
2328static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2329 IntrinsicInst &II) {
2330 Value *PgVal = II.getArgOperand(0);
2331 Value *OpVal = II.getArgOperand(1);
2332
2333 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2334 // Later optimizations prefer this form.
2335 if (PgVal == OpVal &&
2336 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2337 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2338 Value *Ops[] = {PgVal, OpVal};
2339 Type *Tys[] = {PgVal->getType()};
2340
2341 auto *PTest =
2342 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2343 PTest->takeName(&II);
2344
2345 return IC.replaceInstUsesWith(II, PTest);
2346 }
2347
2350
2351 if (!Pg || !Op)
2352 return std::nullopt;
2353
2354 Intrinsic::ID OpIID = Op->getIntrinsicID();
2355
2356 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2357 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2358 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2359 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2360 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2361
2362 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2363
2364 PTest->takeName(&II);
2365 return IC.replaceInstUsesWith(II, PTest);
2366 }
2367
2368 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2369 // Later optimizations may rewrite sequence to use the flag-setting variant
2370 // of instruction X to remove PTEST.
2371 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2372 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2373 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2374 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2375 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2376 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2377 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2378 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2379 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2380 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2381 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2382 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2383 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2384 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2385 Type *Tys[] = {Pg->getType()};
2386
2387 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2388 PTest->takeName(&II);
2389
2390 return IC.replaceInstUsesWith(II, PTest);
2391 }
2392
2393 return std::nullopt;
2394}
2395
2396template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2397static std::optional<Instruction *>
2399 bool MergeIntoAddendOp) {
2400 Value *P = II.getOperand(0);
2401 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2402 if (MergeIntoAddendOp) {
2403 AddendOp = II.getOperand(1);
2404 Mul = II.getOperand(2);
2405 } else {
2406 AddendOp = II.getOperand(2);
2407 Mul = II.getOperand(1);
2408 }
2409
2411 m_Value(MulOp1))))
2412 return std::nullopt;
2413
2414 if (!Mul->hasOneUse())
2415 return std::nullopt;
2416
2417 Instruction *FMFSource = nullptr;
2418 if (II.getType()->isFPOrFPVectorTy()) {
2419 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2420 // Stop the combine when the flags on the inputs differ in case dropping
2421 // flags would lead to us missing out on more beneficial optimizations.
2422 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2423 return std::nullopt;
2424 if (!FAddFlags.allowContract())
2425 return std::nullopt;
2426 FMFSource = &II;
2427 }
2428
2429 CallInst *Res;
2430 if (MergeIntoAddendOp)
2431 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2432 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2433 else
2434 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2435 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2436
2437 return IC.replaceInstUsesWith(II, Res);
2438}
2439
2440static std::optional<Instruction *>
2442 Value *Pred = II.getOperand(0);
2443 Value *PtrOp = II.getOperand(1);
2444 Type *VecTy = II.getType();
2445
2446 if (isAllActivePredicate(Pred)) {
2447 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2448 Load->copyMetadata(II);
2449 return IC.replaceInstUsesWith(II, Load);
2450 }
2451
2452 CallInst *MaskedLoad =
2453 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2454 Pred, ConstantAggregateZero::get(VecTy));
2455 MaskedLoad->copyMetadata(II);
2456 return IC.replaceInstUsesWith(II, MaskedLoad);
2457}
2458
2459static std::optional<Instruction *>
2461 Value *VecOp = II.getOperand(0);
2462 Value *Pred = II.getOperand(1);
2463 Value *PtrOp = II.getOperand(2);
2464
2465 if (isAllActivePredicate(Pred)) {
2466 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2467 Store->copyMetadata(II);
2468 return IC.eraseInstFromFunction(II);
2469 }
2470
2471 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2472 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2473 MaskedStore->copyMetadata(II);
2474 return IC.eraseInstFromFunction(II);
2475}
2476
2478 switch (Intrinsic) {
2479 case Intrinsic::aarch64_sve_fmul_u:
2480 return Instruction::BinaryOps::FMul;
2481 case Intrinsic::aarch64_sve_fadd_u:
2482 return Instruction::BinaryOps::FAdd;
2483 case Intrinsic::aarch64_sve_fsub_u:
2484 return Instruction::BinaryOps::FSub;
2485 default:
2486 return Instruction::BinaryOpsEnd;
2487 }
2488}
2489
2490static std::optional<Instruction *>
2492 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2493 if (II.isStrictFP())
2494 return std::nullopt;
2495
2496 auto *OpPredicate = II.getOperand(0);
2497 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2498 if (BinOpCode == Instruction::BinaryOpsEnd ||
2499 !isAllActivePredicate(OpPredicate))
2500 return std::nullopt;
2501 auto BinOp = IC.Builder.CreateBinOpFMF(
2502 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2503 return IC.replaceInstUsesWith(II, BinOp);
2504}
2505
2506static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2507 IntrinsicInst &II) {
2508 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2509 Intrinsic::aarch64_sve_mla>(
2510 IC, II, true))
2511 return MLA;
2512 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2513 Intrinsic::aarch64_sve_mad>(
2514 IC, II, false))
2515 return MAD;
2516 return std::nullopt;
2517}
2518
2519static std::optional<Instruction *>
2521 if (auto FMLA =
2522 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2523 Intrinsic::aarch64_sve_fmla>(IC, II,
2524 true))
2525 return FMLA;
2526 if (auto FMAD =
2527 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2528 Intrinsic::aarch64_sve_fmad>(IC, II,
2529 false))
2530 return FMAD;
2531 if (auto FMLA =
2532 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2533 Intrinsic::aarch64_sve_fmla>(IC, II,
2534 true))
2535 return FMLA;
2536 return std::nullopt;
2537}
2538
2539static std::optional<Instruction *>
2541 if (auto FMLA =
2542 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2543 Intrinsic::aarch64_sve_fmla>(IC, II,
2544 true))
2545 return FMLA;
2546 if (auto FMAD =
2547 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2548 Intrinsic::aarch64_sve_fmad>(IC, II,
2549 false))
2550 return FMAD;
2551 if (auto FMLA_U =
2552 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2553 Intrinsic::aarch64_sve_fmla_u>(
2554 IC, II, true))
2555 return FMLA_U;
2556 return instCombineSVEVectorBinOp(IC, II);
2557}
2558
2559static std::optional<Instruction *>
2561 if (auto FMLS =
2562 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2563 Intrinsic::aarch64_sve_fmls>(IC, II,
2564 true))
2565 return FMLS;
2566 if (auto FMSB =
2567 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2568 Intrinsic::aarch64_sve_fnmsb>(
2569 IC, II, false))
2570 return FMSB;
2571 if (auto FMLS =
2572 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2573 Intrinsic::aarch64_sve_fmls>(IC, II,
2574 true))
2575 return FMLS;
2576 return std::nullopt;
2577}
2578
2579static std::optional<Instruction *>
2581 if (auto FMLS =
2582 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2583 Intrinsic::aarch64_sve_fmls>(IC, II,
2584 true))
2585 return FMLS;
2586 if (auto FMSB =
2587 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2588 Intrinsic::aarch64_sve_fnmsb>(
2589 IC, II, false))
2590 return FMSB;
2591 if (auto FMLS_U =
2592 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2593 Intrinsic::aarch64_sve_fmls_u>(
2594 IC, II, true))
2595 return FMLS_U;
2596 return instCombineSVEVectorBinOp(IC, II);
2597}
2598
2599static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2600 IntrinsicInst &II) {
2601 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2602 Intrinsic::aarch64_sve_mls>(
2603 IC, II, true))
2604 return MLS;
2605 return std::nullopt;
2606}
2607
2608static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2609 IntrinsicInst &II) {
2610 Value *UnpackArg = II.getArgOperand(0);
2611 auto *RetTy = cast<ScalableVectorType>(II.getType());
2612 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2613 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2614
2615 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2616 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2617 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2618 ScalarArg =
2619 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2620 Value *NewVal =
2621 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2622 NewVal->takeName(&II);
2623 return IC.replaceInstUsesWith(II, NewVal);
2624 }
2625
2626 return std::nullopt;
2627}
2628static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2629 IntrinsicInst &II) {
2630 auto *OpVal = II.getOperand(0);
2631 auto *OpIndices = II.getOperand(1);
2632 VectorType *VTy = cast<VectorType>(II.getType());
2633
2634 // Check whether OpIndices is a constant splat value < minimal element count
2635 // of result.
2636 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2637 if (!SplatValue ||
2638 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2639 return std::nullopt;
2640
2641 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2642 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2643 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2644 auto *VectorSplat =
2645 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2646
2647 VectorSplat->takeName(&II);
2648 return IC.replaceInstUsesWith(II, VectorSplat);
2649}
2650
2651static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2652 IntrinsicInst &II) {
2653 Value *A, *B;
2654 Type *RetTy = II.getType();
2655 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2656 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2657
2658 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2659 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2660 if ((match(II.getArgOperand(0),
2662 match(II.getArgOperand(1),
2664 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2665 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2666 auto *TyA = cast<ScalableVectorType>(A->getType());
2667 if (TyA == B->getType() &&
2669 auto *SubVec = IC.Builder.CreateInsertVector(
2670 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2671 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2672 TyA->getMinNumElements());
2673 ConcatVec->takeName(&II);
2674 return IC.replaceInstUsesWith(II, ConcatVec);
2675 }
2676 }
2677
2678 return std::nullopt;
2679}
2680
2681static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2682 IntrinsicInst &II) {
2683 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2684 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2685 Value *A, *B;
2686 if (match(II.getArgOperand(0),
2689 m_Specific(A), m_Specific(B))))
2690 return IC.replaceInstUsesWith(
2691 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2692
2693 return std::nullopt;
2694}
2695
2696static std::optional<Instruction *>
2698 Value *Mask = II.getOperand(0);
2699 Value *BasePtr = II.getOperand(1);
2700 Value *Index = II.getOperand(2);
2701 Type *Ty = II.getType();
2702 Value *PassThru = ConstantAggregateZero::get(Ty);
2703
2704 // Contiguous gather => masked load.
2705 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2706 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2707 Value *IndexBase;
2709 m_One()))) {
2710 Align Alignment =
2711 BasePtr->getPointerAlignment(II.getDataLayout());
2712
2713 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2714 BasePtr, IndexBase);
2715 CallInst *MaskedLoad =
2716 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2717 MaskedLoad->takeName(&II);
2718 return IC.replaceInstUsesWith(II, MaskedLoad);
2719 }
2720
2721 return std::nullopt;
2722}
2723
2724static std::optional<Instruction *>
2726 Value *Val = II.getOperand(0);
2727 Value *Mask = II.getOperand(1);
2728 Value *BasePtr = II.getOperand(2);
2729 Value *Index = II.getOperand(3);
2730 Type *Ty = Val->getType();
2731
2732 // Contiguous scatter => masked store.
2733 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2734 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2735 Value *IndexBase;
2737 m_One()))) {
2738 Align Alignment =
2739 BasePtr->getPointerAlignment(II.getDataLayout());
2740
2741 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2742 BasePtr, IndexBase);
2743 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2744
2745 return IC.eraseInstFromFunction(II);
2746 }
2747
2748 return std::nullopt;
2749}
2750
2751static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2752 IntrinsicInst &II) {
2754 Value *Pred = II.getOperand(0);
2755 Value *Vec = II.getOperand(1);
2756 Value *DivVec = II.getOperand(2);
2757
2758 Value *SplatValue = getSplatValue(DivVec);
2759 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2760 if (!SplatConstantInt)
2761 return std::nullopt;
2762
2763 APInt Divisor = SplatConstantInt->getValue();
2764 const int64_t DivisorValue = Divisor.getSExtValue();
2765 if (DivisorValue == -1)
2766 return std::nullopt;
2767 if (DivisorValue == 1)
2768 IC.replaceInstUsesWith(II, Vec);
2769
2770 if (Divisor.isPowerOf2()) {
2771 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2772 auto ASRD = IC.Builder.CreateIntrinsic(
2773 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2774 return IC.replaceInstUsesWith(II, ASRD);
2775 }
2776 if (Divisor.isNegatedPowerOf2()) {
2777 Divisor.negate();
2778 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2779 auto ASRD = IC.Builder.CreateIntrinsic(
2780 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2781 auto NEG = IC.Builder.CreateIntrinsic(
2782 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2783 return IC.replaceInstUsesWith(II, NEG);
2784 }
2785
2786 return std::nullopt;
2787}
2788
2789bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2790 size_t VecSize = Vec.size();
2791 if (VecSize == 1)
2792 return true;
2793 if (!isPowerOf2_64(VecSize))
2794 return false;
2795 size_t HalfVecSize = VecSize / 2;
2796
2797 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2798 RHS != Vec.end(); LHS++, RHS++) {
2799 if (*LHS != nullptr && *RHS != nullptr) {
2800 if (*LHS == *RHS)
2801 continue;
2802 else
2803 return false;
2804 }
2805 if (!AllowPoison)
2806 return false;
2807 if (*LHS == nullptr && *RHS != nullptr)
2808 *LHS = *RHS;
2809 }
2810
2811 Vec.resize(HalfVecSize);
2812 SimplifyValuePattern(Vec, AllowPoison);
2813 return true;
2814}
2815
2816// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2817// to dupqlane(f64(C)) where C is A concatenated with B
2818static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2819 IntrinsicInst &II) {
2820 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2821 if (!match(II.getOperand(0),
2823 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2824 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2825 return std::nullopt;
2826 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2827
2828 // Insert the scalars into a container ordered by InsertElement index
2829 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2830 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2831 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2832 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2833 CurrentInsertElt = InsertElt->getOperand(0);
2834 }
2835
2836 bool AllowPoison =
2837 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2838 if (!SimplifyValuePattern(Elts, AllowPoison))
2839 return std::nullopt;
2840
2841 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2842 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2843 for (size_t I = 0; I < Elts.size(); I++) {
2844 if (Elts[I] == nullptr)
2845 continue;
2846 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2847 IC.Builder.getInt64(I));
2848 }
2849 if (InsertEltChain == nullptr)
2850 return std::nullopt;
2851
2852 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2853 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2854 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2855 // be narrowed back to the original type.
2856 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2857 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2858 IIScalableTy->getMinNumElements() /
2859 PatternWidth;
2860
2861 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2862 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2863 auto *WideShuffleMaskTy =
2864 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2865
2866 auto InsertSubvector = IC.Builder.CreateInsertVector(
2867 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2868 uint64_t(0));
2869 auto WideBitcast =
2870 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2871 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2872 auto WideShuffle = IC.Builder.CreateShuffleVector(
2873 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2874 auto NarrowBitcast =
2875 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2876
2877 return IC.replaceInstUsesWith(II, NarrowBitcast);
2878}
2879
2880static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2881 IntrinsicInst &II) {
2882 Value *A = II.getArgOperand(0);
2883 Value *B = II.getArgOperand(1);
2884 if (A == B)
2885 return IC.replaceInstUsesWith(II, A);
2886
2887 return std::nullopt;
2888}
2889
2890static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2891 IntrinsicInst &II) {
2892 Value *Pred = II.getOperand(0);
2893 Value *Vec = II.getOperand(1);
2894 Value *Shift = II.getOperand(2);
2895
2896 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2897 Value *AbsPred, *MergedValue;
2899 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2901 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2902
2903 return std::nullopt;
2904
2905 // Transform is valid if any of the following are true:
2906 // * The ABS merge value is an undef or non-negative
2907 // * The ABS predicate is all active
2908 // * The ABS predicate and the SRSHL predicates are the same
2909 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2910 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2911 return std::nullopt;
2912
2913 // Only valid when the shift amount is non-negative, otherwise the rounding
2914 // behaviour of SRSHL cannot be ignored.
2915 if (!match(Shift, m_NonNegative()))
2916 return std::nullopt;
2917
2918 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2919 {II.getType()}, {Pred, Vec, Shift});
2920
2921 return IC.replaceInstUsesWith(II, LSL);
2922}
2923
2924static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2925 IntrinsicInst &II) {
2926 Value *Vec = II.getOperand(0);
2927
2928 if (getSplatValue(Vec) == II.getOperand(1))
2929 return IC.replaceInstUsesWith(II, Vec);
2930
2931 return std::nullopt;
2932}
2933
2934static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2935 IntrinsicInst &II) {
2936 // If this barrier is post-dominated by identical one we can remove it
2937 auto *NI = II.getNextNode();
2938 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2939 auto CanSkipOver = [](Instruction *I) {
2940 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2941 };
2942 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2943 auto *NIBB = NI->getParent();
2944 NI = NI->getNextNode();
2945 if (!NI) {
2946 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2947 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2948 else
2949 break;
2950 }
2951 }
2952 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2953 if (NextII && II.isIdenticalTo(NextII))
2954 return IC.eraseInstFromFunction(II);
2955
2956 return std::nullopt;
2957}
2958
2959static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2960 IntrinsicInst &II) {
2961 return IC.replaceInstUsesWith(
2962 II,
2963 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2964 {II.getType(), II.getOperand(0)->getType()},
2965 {II.getOperand(0), II.getOperand(1)}));
2966}
2967
2968static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2969 IntrinsicInst &II) {
2971 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2972 return std::nullopt;
2973}
2974
2975static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2977 unsigned NumBits) {
2978 Value *Passthru = II.getOperand(0);
2979 Value *Pg = II.getOperand(1);
2980 Value *Op = II.getOperand(2);
2981
2982 // Convert UXT[BHW] to AND.
2983 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2984 auto *Ty = cast<VectorType>(II.getType());
2985 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2986 auto *Mask = ConstantInt::get(Ty, MaskValue);
2987 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2988 {Pg, Op, Mask});
2989 return IC.replaceInstUsesWith(II, And);
2990 }
2991
2992 return std::nullopt;
2993}
2994
2995static std::optional<Instruction *>
2997 SMEAttrs FnSMEAttrs(*II.getFunction());
2998 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2999 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
3000 return IC.replaceInstUsesWith(
3001 II, ConstantInt::getBool(II.getType(), IsStreaming));
3002 return std::nullopt;
3003}
3004
3005std::optional<Instruction *>
3007 IntrinsicInst &II) const {
3009 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3010 return I;
3011
3012 Intrinsic::ID IID = II.getIntrinsicID();
3013 switch (IID) {
3014 default:
3015 break;
3016 case Intrinsic::aarch64_dmb:
3017 return instCombineDMB(IC, II);
3018 case Intrinsic::aarch64_neon_fmaxnm:
3019 case Intrinsic::aarch64_neon_fminnm:
3020 return instCombineMaxMinNM(IC, II);
3021 case Intrinsic::aarch64_sve_convert_from_svbool:
3022 return instCombineConvertFromSVBool(IC, II);
3023 case Intrinsic::aarch64_sve_dup:
3024 return instCombineSVEDup(IC, II);
3025 case Intrinsic::aarch64_sve_dup_x:
3026 return instCombineSVEDupX(IC, II);
3027 case Intrinsic::aarch64_sve_cmpne:
3028 case Intrinsic::aarch64_sve_cmpne_wide:
3029 return instCombineSVECmpNE(IC, II);
3030 case Intrinsic::aarch64_sve_rdffr:
3031 return instCombineRDFFR(IC, II);
3032 case Intrinsic::aarch64_sve_lasta:
3033 case Intrinsic::aarch64_sve_lastb:
3034 return instCombineSVELast(IC, II);
3035 case Intrinsic::aarch64_sve_clasta_n:
3036 case Intrinsic::aarch64_sve_clastb_n:
3037 return instCombineSVECondLast(IC, II);
3038 case Intrinsic::aarch64_sve_cntd:
3039 return instCombineSVECntElts(IC, II, 2);
3040 case Intrinsic::aarch64_sve_cntw:
3041 return instCombineSVECntElts(IC, II, 4);
3042 case Intrinsic::aarch64_sve_cnth:
3043 return instCombineSVECntElts(IC, II, 8);
3044 case Intrinsic::aarch64_sve_cntb:
3045 return instCombineSVECntElts(IC, II, 16);
3046 case Intrinsic::aarch64_sme_cntsd:
3047 return instCombineSMECntsd(IC, II, ST);
3048 case Intrinsic::aarch64_sve_ptest_any:
3049 case Intrinsic::aarch64_sve_ptest_first:
3050 case Intrinsic::aarch64_sve_ptest_last:
3051 return instCombineSVEPTest(IC, II);
3052 case Intrinsic::aarch64_sve_fadd:
3053 return instCombineSVEVectorFAdd(IC, II);
3054 case Intrinsic::aarch64_sve_fadd_u:
3055 return instCombineSVEVectorFAddU(IC, II);
3056 case Intrinsic::aarch64_sve_fmul_u:
3057 return instCombineSVEVectorBinOp(IC, II);
3058 case Intrinsic::aarch64_sve_fsub:
3059 return instCombineSVEVectorFSub(IC, II);
3060 case Intrinsic::aarch64_sve_fsub_u:
3061 return instCombineSVEVectorFSubU(IC, II);
3062 case Intrinsic::aarch64_sve_add:
3063 return instCombineSVEVectorAdd(IC, II);
3064 case Intrinsic::aarch64_sve_add_u:
3065 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3066 Intrinsic::aarch64_sve_mla_u>(
3067 IC, II, true);
3068 case Intrinsic::aarch64_sve_sub:
3069 return instCombineSVEVectorSub(IC, II);
3070 case Intrinsic::aarch64_sve_sub_u:
3071 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3072 Intrinsic::aarch64_sve_mls_u>(
3073 IC, II, true);
3074 case Intrinsic::aarch64_sve_tbl:
3075 return instCombineSVETBL(IC, II);
3076 case Intrinsic::aarch64_sve_uunpkhi:
3077 case Intrinsic::aarch64_sve_uunpklo:
3078 case Intrinsic::aarch64_sve_sunpkhi:
3079 case Intrinsic::aarch64_sve_sunpklo:
3080 return instCombineSVEUnpack(IC, II);
3081 case Intrinsic::aarch64_sve_uzp1:
3082 return instCombineSVEUzp1(IC, II);
3083 case Intrinsic::aarch64_sve_zip1:
3084 case Intrinsic::aarch64_sve_zip2:
3085 return instCombineSVEZip(IC, II);
3086 case Intrinsic::aarch64_sve_ld1_gather_index:
3087 return instCombineLD1GatherIndex(IC, II);
3088 case Intrinsic::aarch64_sve_st1_scatter_index:
3089 return instCombineST1ScatterIndex(IC, II);
3090 case Intrinsic::aarch64_sve_ld1:
3091 return instCombineSVELD1(IC, II, DL);
3092 case Intrinsic::aarch64_sve_st1:
3093 return instCombineSVEST1(IC, II, DL);
3094 case Intrinsic::aarch64_sve_sdiv:
3095 return instCombineSVESDIV(IC, II);
3096 case Intrinsic::aarch64_sve_sel:
3097 return instCombineSVESel(IC, II);
3098 case Intrinsic::aarch64_sve_srshl:
3099 return instCombineSVESrshl(IC, II);
3100 case Intrinsic::aarch64_sve_dupq_lane:
3101 return instCombineSVEDupqLane(IC, II);
3102 case Intrinsic::aarch64_sve_insr:
3103 return instCombineSVEInsr(IC, II);
3104 case Intrinsic::aarch64_sve_whilelo:
3105 return instCombineWhilelo(IC, II);
3106 case Intrinsic::aarch64_sve_ptrue:
3107 return instCombinePTrue(IC, II);
3108 case Intrinsic::aarch64_sve_uxtb:
3109 return instCombineSVEUxt(IC, II, 8);
3110 case Intrinsic::aarch64_sve_uxth:
3111 return instCombineSVEUxt(IC, II, 16);
3112 case Intrinsic::aarch64_sve_uxtw:
3113 return instCombineSVEUxt(IC, II, 32);
3114 case Intrinsic::aarch64_sme_in_streaming_mode:
3115 return instCombineInStreamingMode(IC, II);
3116 }
3117
3118 return std::nullopt;
3119}
3120
3122 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3123 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3124 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3125 SimplifyAndSetOp) const {
3126 switch (II.getIntrinsicID()) {
3127 default:
3128 break;
3129 case Intrinsic::aarch64_neon_fcvtxn:
3130 case Intrinsic::aarch64_neon_rshrn:
3131 case Intrinsic::aarch64_neon_sqrshrn:
3132 case Intrinsic::aarch64_neon_sqrshrun:
3133 case Intrinsic::aarch64_neon_sqshrn:
3134 case Intrinsic::aarch64_neon_sqshrun:
3135 case Intrinsic::aarch64_neon_sqxtn:
3136 case Intrinsic::aarch64_neon_sqxtun:
3137 case Intrinsic::aarch64_neon_uqrshrn:
3138 case Intrinsic::aarch64_neon_uqshrn:
3139 case Intrinsic::aarch64_neon_uqxtn:
3140 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3141 break;
3142 }
3143
3144 return std::nullopt;
3145}
3146
3148 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3150}
3151
3154 switch (K) {
3156 return TypeSize::getFixed(64);
3158 if (ST->useSVEForFixedLengthVectors() &&
3159 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3160 return TypeSize::getFixed(
3161 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3162 else if (ST->isNeonAvailable())
3163 return TypeSize::getFixed(128);
3164 else
3165 return TypeSize::getFixed(0);
3167 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3169 return TypeSize::getScalable(128);
3170 else
3171 return TypeSize::getScalable(0);
3172 }
3173 llvm_unreachable("Unsupported register kind");
3174}
3175
3176bool AArch64TTIImpl::isSingleExtWideningInstruction(
3177 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3178 Type *SrcOverrideTy) const {
3179 // A helper that returns a vector type from the given type. The number of
3180 // elements in type Ty determines the vector width.
3181 auto toVectorTy = [&](Type *ArgTy) {
3182 return VectorType::get(ArgTy->getScalarType(),
3183 cast<VectorType>(DstTy)->getElementCount());
3184 };
3185
3186 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3187 // i32, i64]. SVE doesn't generally have the same set of instructions to
3188 // perform an extend with the add/sub/mul. There are SMULLB style
3189 // instructions, but they operate on top/bottom, requiring some sort of lane
3190 // interleaving to be used with zext/sext.
3191 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3192 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3193 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3194 return false;
3195
3196 Type *SrcTy = SrcOverrideTy;
3197 switch (Opcode) {
3198 case Instruction::Add: // UADDW(2), SADDW(2).
3199 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3200 // The second operand needs to be an extend
3201 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3202 if (!SrcTy)
3203 SrcTy =
3204 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3205 break;
3206 }
3207
3208 if (Opcode == Instruction::Sub)
3209 return false;
3210
3211 // UADDW(2), SADDW(2) can be commutted.
3212 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3213 if (!SrcTy)
3214 SrcTy =
3215 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3216 break;
3217 }
3218 return false;
3219 }
3220 default:
3221 return false;
3222 }
3223
3224 // Legalize the destination type and ensure it can be used in a widening
3225 // operation.
3226 auto DstTyL = getTypeLegalizationCost(DstTy);
3227 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3228 return false;
3229
3230 // Legalize the source type and ensure it can be used in a widening
3231 // operation.
3232 assert(SrcTy && "Expected some SrcTy");
3233 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3234 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3235 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3236 return false;
3237
3238 // Get the total number of vector elements in the legalized types.
3239 InstructionCost NumDstEls =
3240 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3241 InstructionCost NumSrcEls =
3242 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3243
3244 // Return true if the legalized types have the same number of vector elements
3245 // and the destination element type size is twice that of the source type.
3246 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3247}
3248
3249Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3251 Type *SrcOverrideTy) const {
3252 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3253 Opcode != Instruction::Mul)
3254 return nullptr;
3255
3256 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3257 // i32, i64]. SVE doesn't generally have the same set of instructions to
3258 // perform an extend with the add/sub/mul. There are SMULLB style
3259 // instructions, but they operate on top/bottom, requiring some sort of lane
3260 // interleaving to be used with zext/sext.
3261 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3262 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3263 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3264 return nullptr;
3265
3266 auto getScalarSizeWithOverride = [&](const Value *V) {
3267 if (SrcOverrideTy)
3268 return SrcOverrideTy->getScalarSizeInBits();
3269 return cast<Instruction>(V)
3270 ->getOperand(0)
3271 ->getType()
3272 ->getScalarSizeInBits();
3273 };
3274
3275 unsigned MaxEltSize = 0;
3276 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3277 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3278 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3279 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3280 MaxEltSize = std::max(EltSize0, EltSize1);
3281 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3282 isa<SExtInst, ZExtInst>(Args[1])) {
3283 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3284 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3285 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3286 // enough.
3287 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3288 return nullptr;
3289 MaxEltSize = DstEltSize / 2;
3290 } else if (Opcode == Instruction::Mul &&
3291 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3292 // If one of the operands is a Zext and the other has enough zero bits
3293 // to be treated as unsigned, we can still generate a umull, meaning the
3294 // zext is free.
3295 KnownBits Known =
3296 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3297 if (Args[0]->getType()->getScalarSizeInBits() -
3298 Known.Zero.countLeadingOnes() >
3299 DstTy->getScalarSizeInBits() / 2)
3300 return nullptr;
3301
3302 MaxEltSize =
3303 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3304 } else
3305 return nullptr;
3306
3307 if (MaxEltSize * 2 > DstEltSize)
3308 return nullptr;
3309
3310 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3311 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3312 return nullptr;
3313 return ExtTy;
3314}
3315
3316// s/urhadd instructions implement the following pattern, making the
3317// extends free:
3318// %x = add ((zext i8 -> i16), 1)
3319// %y = (zext i8 -> i16)
3320// trunc i16 (lshr (add %x, %y), 1) -> i8
3321//
3323 Type *Src) const {
3324 // The source should be a legal vector type.
3325 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3326 (Src->isScalableTy() && !ST->hasSVE2()))
3327 return false;
3328
3329 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3330 return false;
3331
3332 // Look for trunc/shl/add before trying to match the pattern.
3333 const Instruction *Add = ExtUser;
3334 auto *AddUser =
3335 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3336 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3337 Add = AddUser;
3338
3339 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3340 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3341 return false;
3342
3343 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3344 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3345 Src->getScalarSizeInBits() !=
3346 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3347 return false;
3348
3349 // Try to match the whole pattern. Ext could be either the first or second
3350 // m_ZExtOrSExt matched.
3351 Instruction *Ex1, *Ex2;
3352 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3353 m_c_Add(m_Instruction(Ex2), m_One())))))
3354 return false;
3355
3356 // Ensure both extends are of the same type
3357 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3358 Ex1->getOpcode() == Ex2->getOpcode())
3359 return true;
3360
3361 return false;
3362}
3363
3365 Type *Src,
3368 const Instruction *I) const {
3369 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3370 assert(ISD && "Invalid opcode");
3371 // If the cast is observable, and it is used by a widening instruction (e.g.,
3372 // uaddl, saddw, etc.), it may be free.
3373 if (I && I->hasOneUser()) {
3374 auto *SingleUser = cast<Instruction>(*I->user_begin());
3375 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3376 if (Type *ExtTy = isBinExtWideningInstruction(
3377 SingleUser->getOpcode(), Dst, Operands,
3378 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3379 // The cost from Src->Src*2 needs to be added if required, the cost from
3380 // Src*2->ExtTy is free.
3381 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3382 Type *DoubleSrcTy =
3383 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3384 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3386 }
3387
3388 return 0;
3389 }
3390
3391 if (isSingleExtWideningInstruction(
3392 SingleUser->getOpcode(), Dst, Operands,
3393 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3394 // For adds only count the second operand as free if both operands are
3395 // extends but not the same operation. (i.e both operands are not free in
3396 // add(sext, zext)).
3397 if (SingleUser->getOpcode() == Instruction::Add) {
3398 if (I == SingleUser->getOperand(1) ||
3399 (isa<CastInst>(SingleUser->getOperand(1)) &&
3400 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3401 return 0;
3402 } else {
3403 // Others are free so long as isSingleExtWideningInstruction
3404 // returned true.
3405 return 0;
3406 }
3407 }
3408
3409 // The cast will be free for the s/urhadd instructions
3410 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3411 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3412 return 0;
3413 }
3414
3415 EVT SrcTy = TLI->getValueType(DL, Src);
3416 EVT DstTy = TLI->getValueType(DL, Dst);
3417
3418 if (!SrcTy.isSimple() || !DstTy.isSimple())
3419 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3420
3421 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3422 // we use fcvtx under SVE2. Give them invalid costs.
3423 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3424 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3425 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3427
3428 static const TypeConversionCostTblEntry BF16Tbl[] = {
3429 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3430 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3431 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3432 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3433 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3434 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3435 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3436 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3437 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3438 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3439 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3440 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3441 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3442 };
3443
3444 if (ST->hasBF16())
3445 if (const auto *Entry = ConvertCostTableLookup(
3446 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3447 return Entry->Cost;
3448
3449 // We have to estimate a cost of fixed length operation upon
3450 // SVE registers(operations) with the number of registers required
3451 // for a fixed type to be represented upon SVE registers.
3452 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3453 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3454 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3455 ST->useSVEForFixedLengthVectors(WiderTy)) {
3456 std::pair<InstructionCost, MVT> LT =
3457 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3458 unsigned NumElements =
3459 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3460 return LT.first *
3462 Opcode,
3463 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3464 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3465 CostKind, I);
3466 }
3467
3468 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3469 // The cost of unpacking twice is artificially increased for now in order
3470 // to avoid regressions against NEON, which will use tbl instructions directly
3471 // instead of multiple layers of [s|u]unpk[lo|hi].
3472 // We use the unpacks in cases where the destination type is illegal and
3473 // requires splitting of the input, even if the input type itself is legal.
3474 const unsigned int SVE_EXT_COST = 1;
3475 const unsigned int SVE_FCVT_COST = 1;
3476 const unsigned int SVE_UNPACK_ONCE = 4;
3477 const unsigned int SVE_UNPACK_TWICE = 16;
3478
3479 static const TypeConversionCostTblEntry ConversionTbl[] = {
3480 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3481 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3482 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3483 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3484 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3485 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3486 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3487 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3488 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3489 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3490 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3491 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3492 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3493 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3494 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3495 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3496 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3497 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3498 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3499 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3500
3501 // Truncations on nxvmiN
3502 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3503 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3504 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3505 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3506 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3507 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3508 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3509 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3510 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3511 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3512 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3513 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3514 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3515 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3516 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3517 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3518 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3519 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3520 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3521 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3522 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3523 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3524 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3525 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3526 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3527 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3528 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3529 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3530 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3531 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3532 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3533 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3534 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3535
3536 // The number of shll instructions for the extension.
3537 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3538 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3539 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3540 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3541 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3542 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3543 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3544 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3545 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3546 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3547 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3548 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3549 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3550 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3551 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3552 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3553
3554 // FP Ext and trunc
3555 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3556 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3557 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3558 // FP16
3559 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3560 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3561 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3562 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3563 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3564 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3565 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3566 // BF16 (uses shift)
3567 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3568 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3569 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3570 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3571 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3572 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3573 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3574 // FP Ext and trunc
3575 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3576 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3577 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3578 // FP16
3579 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3580 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3581 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3582 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3583 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3584 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3585 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3586 // BF16 (more complex, with +bf16 is handled above)
3587 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3588 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3589 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3590 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3591 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3592 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3593 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3594 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3595
3596 // LowerVectorINT_TO_FP:
3597 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3598 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3599 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3600 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3601 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3602 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3603
3604 // SVE: to nxv2f16
3605 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3606 SVE_EXT_COST + SVE_FCVT_COST},
3607 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3608 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3609 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3610 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3611 SVE_EXT_COST + SVE_FCVT_COST},
3612 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3613 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3614 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3615
3616 // SVE: to nxv4f16
3617 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3618 SVE_EXT_COST + SVE_FCVT_COST},
3619 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3620 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3622 SVE_EXT_COST + SVE_FCVT_COST},
3623 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3624 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3625
3626 // SVE: to nxv8f16
3627 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3628 SVE_EXT_COST + SVE_FCVT_COST},
3629 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3630 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3631 SVE_EXT_COST + SVE_FCVT_COST},
3632 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3633
3634 // SVE: to nxv16f16
3635 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3636 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3637 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3638 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3639
3640 // Complex: to v2f32
3641 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3642 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3643 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3644 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3645
3646 // SVE: to nxv2f32
3647 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3648 SVE_EXT_COST + SVE_FCVT_COST},
3649 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3650 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3651 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3652 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3653 SVE_EXT_COST + SVE_FCVT_COST},
3654 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3655 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3656 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3657
3658 // Complex: to v4f32
3659 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3660 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3661 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3662 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3663
3664 // SVE: to nxv4f32
3665 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3666 SVE_EXT_COST + SVE_FCVT_COST},
3667 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3668 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3669 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3670 SVE_EXT_COST + SVE_FCVT_COST},
3671 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3672 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3673
3674 // Complex: to v8f32
3675 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3676 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3677 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3678 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3679
3680 // SVE: to nxv8f32
3681 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3682 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3683 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3684 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3685 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3686 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3687 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3688 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3689
3690 // SVE: to nxv16f32
3691 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3692 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3693 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3694 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3695
3696 // Complex: to v16f32
3697 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3698 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3699
3700 // Complex: to v2f64
3701 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3702 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3703 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3704 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3705 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3706 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3707
3708 // SVE: to nxv2f64
3709 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3710 SVE_EXT_COST + SVE_FCVT_COST},
3711 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3712 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3713 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3714 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3715 SVE_EXT_COST + SVE_FCVT_COST},
3716 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3717 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3718 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3719
3720 // Complex: to v4f64
3721 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3722 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3723
3724 // SVE: to nxv4f64
3725 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3726 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3727 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3728 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3729 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3730 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3731 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3732 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3733 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3734 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3735 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3736 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3737
3738 // SVE: to nxv8f64
3739 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3740 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3741 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3742 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3743 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3744 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3745 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3746 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3747
3748 // LowerVectorFP_TO_INT
3749 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3750 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3751 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3752 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3753 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3754 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3755
3756 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3757 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3758 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3759 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3760 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3761 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3762 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3763
3764 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3765 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3766 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3767 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3768 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3769
3770 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3771 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3772 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3773 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3774 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3775 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3776 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3777
3778 // Complex, from nxv2f32.
3779 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3780 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3781 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3782 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3783 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3784 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3785 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3786 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3787
3788 // Complex, from nxv2f64.
3789 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3790 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3791 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3792 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3793 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3794 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3795 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3796 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3797 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3798 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3799
3800 // Complex, from nxv4f32.
3801 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3802 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3803 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3804 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3805 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3806 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3807 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3808 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3809 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3810 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3811
3812 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3813 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3814 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3815 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3816 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3817
3818 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3819 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3820 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3821 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3822 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3823 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3824 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3825
3826 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3827 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3828 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3829 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3830 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3831
3832 // Complex, from nxv8f16.
3833 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3834 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3835 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3836 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3837 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3838 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3839 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3840 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3841 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3842 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3843
3844 // Complex, from nxv4f16.
3845 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3846 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3847 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3848 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3849 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3850 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3851 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3852 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3853
3854 // Complex, from nxv2f16.
3855 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3856 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3857 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3858 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3859 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3860 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3861 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3862 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3863
3864 // Truncate from nxvmf32 to nxvmf16.
3865 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3866 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3867 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3868
3869 // Truncate from nxvmf32 to nxvmbf16.
3870 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3871 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3872 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3873
3874 // Truncate from nxvmf64 to nxvmf16.
3875 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3876 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3877 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3878
3879 // Truncate from nxvmf64 to nxvmbf16.
3880 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3881 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3882 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3883
3884 // Truncate from nxvmf64 to nxvmf32.
3885 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3886 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3887 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3888
3889 // Extend from nxvmf16 to nxvmf32.
3890 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3891 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3892 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3893
3894 // Extend from nxvmbf16 to nxvmf32.
3895 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3896 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3897 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3898
3899 // Extend from nxvmf16 to nxvmf64.
3900 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3901 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3902 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3903
3904 // Extend from nxvmbf16 to nxvmf64.
3905 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3906 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3907 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3908
3909 // Extend from nxvmf32 to nxvmf64.
3910 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3911 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3912 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3913
3914 // Bitcasts from float to integer
3915 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3916 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3917 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3918
3919 // Bitcasts from integer to float
3920 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3921 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3922 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3923
3924 // Add cost for extending to illegal -too wide- scalable vectors.
3925 // zero/sign extend are implemented by multiple unpack operations,
3926 // where each operation has a cost of 1.
3927 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3928 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3929 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3930 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3931 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3932 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3933
3934 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3935 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3936 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3937 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3938 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3939 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3940 };
3941
3942 if (const auto *Entry = ConvertCostTableLookup(
3943 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3944 return Entry->Cost;
3945
3946 static const TypeConversionCostTblEntry FP16Tbl[] = {
3947 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3948 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3949 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3950 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3951 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3952 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3953 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3954 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3955 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3956 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3957 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3958 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3959 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3960 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3961 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3962 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3963 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3964 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3965 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3966 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3967 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3968 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3969 };
3970
3971 if (ST->hasFullFP16())
3972 if (const auto *Entry = ConvertCostTableLookup(
3973 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3974 return Entry->Cost;
3975
3976 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3977 // double-rounding issues.
3978 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3979 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3981 return cast<FixedVectorType>(Dst)->getNumElements() *
3982 getCastInstrCost(Opcode, Dst->getScalarType(),
3983 Src->getScalarType(), CCH, CostKind) +
3985 true, CostKind) +
3987 false, CostKind);
3988
3989 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3991 ST->isSVEorStreamingSVEAvailable() &&
3992 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3994 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3996 // The standard behaviour in the backend for these cases is to split the
3997 // extend up into two parts:
3998 // 1. Perform an extending load or masked load up to the legal type.
3999 // 2. Extend the loaded data to the final type.
4000 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
4001 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
4003 Opcode, LegalTy, Src, CCH, CostKind, I);
4005 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
4006 return Part1 + Part2;
4007 }
4008
4009 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4010 // but we also want to include the TTI::CastContextHint::Masked case too.
4011 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4013 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4015
4016 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4017}
4018
4021 VectorType *VecTy, unsigned Index,
4023
// Computes the cost of extracting the element at Index from VecTy and then
// sign- or zero-extending it (per Opcode) to Dst. The result is always the
// extract cost; the cast cost from getCastInstrCost is added on top unless
// one of the cases below decides the extend comes for free.
4024 // Make sure we were given a valid extend opcode.
4025 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4026 "Invalid opcode");
4027
4028 // We are extending an element we extract from a vector, so the source type
4029 // of the extend is the element type of the vector.
4030 auto *Src = VecTy->getElementType();
4031
4032 // Sign- and zero-extends are for integer types only.
4033 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4034
4035 // Get the cost for the extract. We compute the cost (if any) for the extend
4036 // below.
4037 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4038 CostKind, Index, nullptr, nullptr);
4039
4040 // Legalize the types.
4041 auto VecLT = getTypeLegalizationCost(VecTy);
4042 auto DstVT = TLI->getValueType(DL, Dst);
4043 auto SrcVT = TLI->getValueType(DL, Src);
4044
4045 // If the resulting type is still a vector and the destination type is legal,
4046 // we may get the extension for free. If not, get the default cost for the
4047 // extend.
4048 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4049 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4050 CostKind);
4051
4052 // The destination type should be larger than the element type. If not, get
4053 // the default cost for the extend.
// Note that equal widths pass this check (the comparison is strictly '<').
4054 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4055 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4056 CostKind);
4057
4058 switch (Opcode) {
4059 default:
4060 llvm_unreachable("Opcode should be either SExt or ZExt");
4061
4062 // For sign-extends, we only need a smov, which performs the extension
4063 // automatically.
4064 case Instruction::SExt:
4065 return Cost;
4066
4067 // For zero-extends, the extend is performed automatically by a umov unless
4068 // the destination type is i64 and the element type is i8 or i16.
// A ZExt that is not free breaks out of the switch (there is no return in
// that case) and is charged the default cast cost after it.
4069 case Instruction::ZExt:
4070 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4071 return Cost;
4072 }
4073
4074 // If we are unable to perform the extend for free, get the default cost.
4075 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4076 CostKind);
4077}
4078
4081 const Instruction *I) const {
// Control-flow cost: on the early-return path PHIs cost 0 and every other
// opcode costs 1; on the remaining path the cost is 0.
// NOTE(review): the condition guarding the first return is not visible in
// this listing. The assert that follows suggests it tests
// CostKind != TTI::TCK_RecipThroughput -- confirm against the full source.
4083 return Opcode == Instruction::PHI ? 0 : 1;
4084 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4085 // Branches are assumed to be predicted.
4086 return 0;
4087}
4088
/// Shared cost computation for a single insertelement/extractelement on the
/// vector type \p Val at lane \p Index; the getVectorInstrCost overloads in
/// this file forward to it.
///
/// \param Opcode  Only Instruction::ExtractElement is tested explicitly (for
///                the fmul-fusion shortcut); all other paths are
///                opcode-independent.
/// \param Index   Lane being accessed. The lane-specific shortcuts are skipped
///                when this is -1U (presumably the "unknown lane" sentinel --
///                confirm against the TTI interface).
/// \param I       Optional instruction being costed. On the fmul-fusion path
///                with a null \p Scalar it is cast<> to ExtractElementInst.
/// \param Scalar  Optional scalar whose users are inspected instead of \p I's.
/// \param ScalarUserAndIdx  (scalar, user, lane) tuples; the lane is recorded
///                for fmul users shared with \p Scalar.
/// \param VIC     Only TTI::VectorInstrContext::Load is examined.
///
/// \returns 0 when the access is considered free (type legalizes to a scalar,
/// lane 0 of a non-integer vector, load context with fast LD1-single, or an
/// extract that can fold into a scalar fmul); otherwise a cost derived from
/// ST->getVectorInsertExtractBaseCost(), or a small constant for
/// TTI::TCK_CodeSize.
4089InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4090 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4091 const Instruction *I, Value *Scalar,
4092 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4093 TTI::VectorInstrContext VIC) const {
4094 assert(Val->isVectorTy() && "This must be a vector type");
4095
// Everything in this block depends on knowing the concrete lane.
4096 if (Index != -1U) {
4097 // Legalize the type.
4098 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4099
4100 // This type is legalized to a scalar type.
4101 if (!LT.second.isVector())
4102 return 0;
4103
4104 // The type may be split. For fixed-width vectors we can normalize the
4105 // index to the new type.
4106 if (LT.second.isFixedLengthVector()) {
4107 unsigned Width = LT.second.getVectorNumElements();
4108 Index = Index % Width;
4109 }
4110
4111 // The element at index zero is already inside the vector.
4112 // - For an insert-element or extract-element
4113 // instruction that extracts integers, an explicit FPR -> GPR move is
4114 // needed. So it has non-zero cost.
4115 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4116 return 0;
4117
4118 // This is recognising a LD1 single-element structure to one lane of one
4119 // register instruction. I.e., if this is an `insertelement` instruction,
4120 // and its second operand is a load, then we will generate a LD1, which
4121 // are expensive instructions on some uArchs.
4122 if (VIC == TTI::VectorInstrContext::Load) {
4123 if (ST->hasFastLD1Single())
4124 return 0;
4125 return CostKind == TTI::TCK_CodeSize
4126 ? 0
4128 }
4129
4130 // i1 inserts and extract will include an extra cset or cmp of the vector
4131 // value. Increase the cost by 1 to account.
4132 if (Val->getScalarSizeInBits() == 1)
4133 return CostKind == TTI::TCK_CodeSize
4134 ? 2
4135 : ST->getVectorInsertExtractBaseCost() + 1;
4136
4137 // FIXME:
4138 // If the extract-element and insert-element instructions could be
4139 // simplified away (e.g., could be combined into users by looking at use-def
4140 // context), they have no cost. This is not done in the first place for
4141 // compile-time considerations.
4142 }
4143
4144 // In case of Neon, if there exists extractelement from lane != 0 such that
4145 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4146 // 2. extractelement result feeds into fmul.
4147 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4148 // equivalent to 0.
4149 // then the extractelement can be merged with fmul in the backend and it
4150 // incurs no cost.
4151 // e.g.
4152 // define double @foo(<2 x double> %a) {
4153 // %1 = extractelement <2 x double> %a, i32 0
4154 // %2 = extractelement <2 x double> %a, i32 1
4155 // %res = fmul double %1, %2
4156 // ret double %res
4157 // }
4158 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4159 auto ExtractCanFuseWithFmul = [&]() {
4160 // We bail out if the extract is from lane 0.
4161 if (Index == 0)
4162 return false;
4163
4164 // Check if the scalar element type of the vector operand of ExtractElement
4165 // instruction is one of the allowed types.
4166 auto IsAllowedScalarTy = [&](const Type *T) {
4167 return T->isFloatTy() || T->isDoubleTy() ||
4168 (T->isHalfTy() && ST->hasFullFP16());
4169 };
4170
4171 // Check if the extractelement user is scalar fmul.
4172 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4173 // Check if the user is scalar fmul.
4174 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4175 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4176 !BO->getType()->isVectorTy();
4177 };
4178
4179 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4180 // certain scalar type and a certain vector register width.
// "Equivalent to 0" means the lane's bit offset (Idx * EltSz) is a multiple
// of RegWidth; a RegWidth of 0 never matches.
4181 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4182 auto RegWidth =
4184 .getFixedValue();
4185 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4186 };
4187
4188 // Check if the type constraints on input vector type and result scalar type
4189 // of extractelement instruction are satisfied.
4190 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4191 return false;
4192
// Two ways of finding the users: via the not-yet-vectorized Scalar and the
// ScalarUserAndIdx tuples, or via the users of the extract instruction I.
4193 if (Scalar) {
4194 DenseMap<User *, unsigned> UserToExtractIdx;
4195 for (auto *U : Scalar->users()) {
4196 if (!IsUserFMulScalarTy(U))
4197 return false;
4198 // Recording entry for the user is important. Index value is not
4199 // important.
4200 UserToExtractIdx[U];
4201 }
4202 if (UserToExtractIdx.empty())
4203 return false;
// NOTE(review): the inner loop variable U shadows the U of the structured
// binding, so the tuple's User element is never read here.
4204 for (auto &[S, U, L] : ScalarUserAndIdx) {
4205 for (auto *U : S->users()) {
4206 if (UserToExtractIdx.contains(U)) {
4207 auto *FMul = cast<BinaryOperator>(U);
4208 auto *Op0 = FMul->getOperand(0);
4209 auto *Op1 = FMul->getOperand(1);
// NOTE(review): this condition is a tautology -- whenever both
// `Op0 != S` and `Op1 != S` are false, the first disjunct is true -- so
// the first shared fmul user of S always records L. Confirm the intent.
4210 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4211 UserToExtractIdx[U] = L;
4212 break;
4213 }
4214 }
4215 }
4216 }
// Fusion is rejected if, for any recorded user, neither this extract's lane
// nor the lane recorded for that user is equivalent to lane 0.
4217 for (auto &[U, L] : UserToExtractIdx) {
4218 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4219 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4220 return false;
4221 }
4222 } else {
4223 const auto *EE = cast<ExtractElementInst>(I);
4224
// The extract's own index must be a constant; its value is not used further.
4225 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4226 if (!IdxOp)
4227 return false;
4228
// Every user must be a scalar fmul (and there must be at least one user).
4229 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4230 if (!IsUserFMulScalarTy(U))
4231 return false;
4232
4233 // Check if the other operand of extractelement is also extractelement
4234 // from lane equivalent to 0.
4235 const auto *BO = cast<BinaryOperator>(U);
4236 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4237 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4238 if (OtherEE) {
4239 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4240 if (!IdxOp)
4241 return false;
4242 return IsExtractLaneEquivalentToZero(
4243 cast<ConstantInt>(OtherEE->getIndexOperand())
4244 ->getValue()
4245 .getZExtValue(),
4246 OtherEE->getType()->getScalarSizeInBits());
4247 }
// The other fmul operand is not an extractelement: accepted as fusable.
4248 return true;
4249 });
4250 }
4251 return true;
4252 };
4253
// The fusion check needs either the instruction or the scalar to find users.
4254 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4255 ExtractCanFuseWithFmul())
4256 return 0;
4257
4258 // All other insert/extracts cost this much.
4259 return CostKind == TTI::TCK_CodeSize ? 1
4260 : ST->getVectorInsertExtractBaseCost();
4261}
4262
4264 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4265 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4266 // Treat insert at lane 0 into a poison vector as having zero cost. This
4267 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4268 // single dup) are treated as cheap.
4269 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4270 isa<PoisonValue>(Op0))
4271 return 0;
// Otherwise defer to the shared helper with no instruction, no scalar and no
// scalar-user information; Op1 is not consulted in this overload.
4272 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4273 nullptr, {}, VIC);
4274}
4275
// NOTE(review): the opening line of this definition (return type and function
// name) is missing from this listing.
// Variant that receives the scalar value together with a list of
// (value, user, index) tuples. It does no costing of its own: everything is
// forwarded to getVectorInstrCostHelper, with nullptr passed ahead of Scalar.
4277 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4278 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4279 TTI::VectorInstrContext VIC) const {
4280 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4281 ScalarUserAndIdx, VIC);
4282}
4283
// NOTE(review): the first lines of this definition (return type, name and the
// parameters that declare I and Val) are missing from this listing.
// Instruction-based variant: the opcode is read from I, and the address of I is
// handed to getVectorInstrCostHelper; no scalar or user list is supplied.
4286 TTI::TargetCostKind CostKind, unsigned Index,
4287 TTI::VectorInstrContext VIC) const {
4288 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4289 nullptr, {}, VIC);
4290}
4291
// NOTE(review): the opening lines of this definition (return type, name and
// the leading parameters) are missing from this listing, as is the first line
// of the return statement taken for fixed vectors.
// Fixed-width vectors take the early return below; every other vector type is
// charged one more than the subtarget's base insert/extract cost (or a flat 2
// when costing for code size).
4295 unsigned Index) const {
4296 if (isa<FixedVectorType>(Val))
// NOTE(review): the start of this return statement is not visible; only its
// trailing argument remains.
4298 Index);
4299
4300 // This typically requires both while and lastb instructions in order
4301 // to extract the last element. If this is in a loop the while
4302 // instruction can at least be hoisted out, although it will consume a
4303 // predicate register. The cost should be more expensive than the base
4304 // extract cost, which is 2 for most CPUs.
4305 return CostKind == TTI::TCK_CodeSize
4306 ? 2
4307 : ST->getVectorInsertExtractBaseCost() + 1;
4308}
4309
// NOTE(review): the opening line of this definition (return type and function
// name) and two lines between the signature and the first visible statement
// are missing from this listing.
// Vectors with a floating-point element type use the generic
// BaseT::getScalarizationOverhead. For all other element types the cost is
// (number of demanded elements) * (Insert + Extract) * (per-operation cost),
// where the per-operation cost is 1 for code size and the subtarget's base
// insert/extract cost otherwise. ForPoisonSrc, VL and VIC are not used in the
// visible code.
4311 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4312 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4313 TTI::VectorInstrContext VIC) const {
4316 if (Ty->getElementType()->isFloatingPointTy())
4317 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4318 CostKind);
4319 unsigned VecInstCost =
4320 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
// Insert and Extract are bools, so their sum counts how many of the two
// operations are requested per demanded element.
4321 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4322}
4323
// Computes the cost of performing a half/bfloat operation by promoting it to
// float, or returns std::nullopt when no promotion is needed:
//  - the scalar type is neither half nor bfloat,
//  - the scalar type is half and the subtarget has full FP16, or
//  - CanUseSVE is set and the subtarget has +sve-b16b16 with non-streaming SVE
//    or SME2 available.
// InstCost is invoked with the promoted (float-based) type to cost the
// operation itself; IncludeTrunc adds the cost of truncating the result back.
// NOTE(review): the parameter line declaring Ty, CostKind and Op1Info, and the
// trailing arguments of both getCastInstrCost calls, are missing from this
// listing.
4324std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4326 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4327 std::function<InstructionCost(Type *)> InstCost) const {
4328 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4329 return std::nullopt;
4330 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4331 return std::nullopt;
4332 // If we have +sve-b16b16 the operation can be promoted to SVE.
4333 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4334 return std::nullopt;
4335
4336 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4337 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
// The extension cost is doubled only when neither operand is a constant.
4339 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4340 Cost *= 2;
4341 Cost += InstCost(PromotedTy);
4342 if (IncludeTrunc)
4343 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4345 return Cost;
4346}
4347
// NOTE(review): several lines of this definition are missing from this
// listing: the opening line (return type and function name), the parameter
// line that declares Op1Info and Op2Info, and a handful of statements that are
// flagged individually below.
// Computes the cost of an arithmetic instruction of type Ty. After the early
// exits, the type is legalized, half/bfloat promotion and fp128 are handled
// for the FP opcodes, widening multiplies of extends are recognised, and the
// remaining cases are dispatched on the ISD opcode. Opcodes without a case
// fall back to BaseT::getArithmeticInstrCost.
4349 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4351 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4352
4353 // The code-generator is currently not able to handle scalable vectors
4354 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4355 // it. This change will be removed when code-generation for these types is
4356 // sufficiently reliable.
4357 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4358 if (VTy->getElementCount() == ElementCount::getScalable(1))
// NOTE(review): the statement guarded by this check is missing from this
// listing; per the comment above it returns an invalid cost.
4360
4361 // TODO: Handle more cost kinds.
// NOTE(review): the condition guarding this fallback is missing from this
// listing.
4363 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4364 Op2Info, Args, CxtI);
4365
4366 // Legalize the type.
4367 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4368 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4369
4370 // Increase the cost for half and bfloat types if not architecturally
4371 // supported.
4372 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4373 ISD == ISD::FDIV || ISD == ISD::FREM) {
4374 if (auto PromotedCost = getFP16BF16PromoteCost(
4375 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4376 // There is no native support for fdiv/frem even with +sve-b16b16.
4377 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4378 [&](Type *PromotedTy) {
4379 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4380 Op1Info, Op2Info);
4381 }))
4382 return *PromotedCost;
4383
4384 // fp128 all go via libcalls
4385 if (Ty->getScalarType()->isFP128Ty())
4386 return (CostKind == TTI::TCK_CodeSize ? 1 : 10) * LT.first;
4387 }
4388
4389 // If the operation is a widening instruction (smull or umull) and both
4390 // operands are extends the cost can be cheaper by considering that the
4391 // operation will operate on the narrowest type size possible (double the
4392 // largest input size) and a further extend.
4393 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4394 if (ExtTy != Ty)
4395 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4396 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
// NOTE(review): the trailing arguments of the getCastInstrCost call above are
// missing from this listing.
4398 return LT.first;
4399 }
4400
4401 switch (ISD) {
4402 default:
4403 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4404 Op2Info);
4405 case ISD::ADD:
4406 case ISD::SUB:
4407 return LT.first; // Also works for i128
4408 case ISD::MUL:
4409 if (LT.second == MVT::v2i64) {
4410 // When SVE is available, then we can lower the v2i64 operation using
4411 // the SVE mul instruction, which has a lower cost.
4412 if (ST->hasSVE())
4413 return LT.first;
4414
4415 // When SVE is not available, there is no MUL.2d instruction,
4416 // which means mul <2 x i64> is expensive as elements are extracted
4417 // from the vectors and the muls scalarized.
4418 // As getScalarizationOverhead is a bit too pessimistic, we
4419 // estimate the cost for a i64 vector directly here, which is:
4420 // - four 2-cost i64 extracts,
4421 // - two 2-cost i64 inserts, and
4422 // - two 1-cost muls.
4423 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4424 // LT.first = 2 the cost is 28.
4425 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4426 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4427 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4428 nullptr, nullptr) *
4429 2 +
4430 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4431 nullptr, nullptr));
4432 }
4433 return LT.first;
4434 case ISD::SREM:
4435 case ISD::SDIV:
4436 /*
4437 Notes for sdiv/srem specific costs:
4438 1. This only considers the cases where the divisor is constant, uniform and
4439 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4440 result in some form of (ldr + adrp), corresponding to constant vectors, or
4441 scalarization of the division operation.
4442 2. Constant divisors, either negative in whole or partially, don't result in
4443 significantly different codegen as compared to positive constant divisors.
4444 So, we don't consider negative divisors separately.
4445 3. If the codegen is significantly different with SVE, it has been indicated
4446 using comments at appropriate places.
4447
4448 sdiv specific cases:
4449 -----------------------------------------------------------------------
4450 codegen | pow-of-2 | Type
4451 -----------------------------------------------------------------------
4452 add + cmp + csel + asr | Y | i64
4453 add + cmp + csel + asr | Y | i32
4454 -----------------------------------------------------------------------
4455
4456 srem specific cases:
4457 -----------------------------------------------------------------------
4458 codegen | pow-of-2 | Type
4459 -----------------------------------------------------------------------
4460 negs + and + and + csneg | Y | i64
4461 negs + and + and + csneg | Y | i32
4462 -----------------------------------------------------------------------
4463
4464 other sdiv/srem cases:
4465 -------------------------------------------------------------------------
4466 common codegen | + srem | + sdiv | pow-of-2 | Type
4467 -------------------------------------------------------------------------
4468 smulh + asr + add + add | - | - | N | i64
4469 smull + lsr + add + add | - | - | N | i32
4470 usra | and + sub | sshr | Y | <2 x i64>
4471 2 * (scalar code) | - | - | N | <2 x i64>
4472 usra | bic + sub | sshr + neg | Y | <4 x i32>
4473 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4474 + sshr + usra | | | |
4475 -------------------------------------------------------------------------
4476 */
4477 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4478 InstructionCost AddCost =
4479 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4480 Op1Info.getNoProps(), Op2Info.getNoProps());
4481 InstructionCost AsrCost =
4482 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4483 Op1Info.getNoProps(), Op2Info.getNoProps());
4484 InstructionCost MulCost =
4485 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4486 Op1Info.getNoProps(), Op2Info.getNoProps());
4487 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4488 // have similar cost.
4489 auto VT = TLI->getValueType(DL, Ty);
4490 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4491 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4492 // Neg can be folded into the asr instruction.
4493 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4494 : (3 * AsrCost + AddCost);
4495 } else {
4496 return MulCost + AsrCost + 2 * AddCost;
4497 }
4498 } else if (VT.isVector()) {
4499 InstructionCost UsraCost = 2 * AsrCost;
4500 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4501 // Division with scalable types corresponds to native 'asrd'
4502 // instruction when SVE is available.
4503 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4504
4505 // One more for the negation in SDIV
// NOTE(review): the line that begins the declaration of Cost is missing from
// this listing; the expression below is its initializer.
4507 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4508 if (Ty->isScalableTy() && ST->hasSVE())
4509 Cost += 2 * AsrCost;
4510 else {
4511 Cost +=
4512 UsraCost +
4513 (ISD == ISD::SDIV
4514 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4515 : 2 * AddCost);
4516 }
4517 return Cost;
4518 } else if (LT.second == MVT::v2i64) {
4519 return VT.getVectorNumElements() *
4520 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4521 Op1Info.getNoProps(),
4522 Op2Info.getNoProps());
4523 } else {
4524 // When SVE is available, we get:
4525 // smulh + lsr + add/sub + asr + add/sub.
4526 if (Ty->isScalableTy() && ST->hasSVE())
4527 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4528 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4529 }
4530 }
4531 }
4532 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4533 LT.second.isFixedLengthVector()) {
4534 // FIXME: When the constant vector is non-uniform, this may result in
4535 // loading the vector from constant pool or in some cases, may also result
4536 // in scalarization. For now, we are approximating this with the
4537 // scalarization cost.
4538 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4539 CostKind, -1, nullptr, nullptr);
4540 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4541 CostKind, -1, nullptr, nullptr);
4542 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4543 return ExtractCost + InsertCost +
4544 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4545 CostKind, Op1Info.getNoProps(),
4546 Op2Info.getNoProps());
4547 }
// Signed division/remainder not handled above shares the generic path with
// the unsigned cases below.
4548 [[fallthrough]];
4549 case ISD::UDIV:
4550 case ISD::UREM: {
4551 auto VT = TLI->getValueType(DL, Ty);
4552 if (Op2Info.isConstant()) {
4553 // If the operand is a power of 2 we can use the shift or and cost.
4554 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4555 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4556 Op1Info.getNoProps(),
4557 Op2Info.getNoProps());
4558 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4559 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4560 Op1Info.getNoProps(),
4561 Op2Info.getNoProps());
4562
4563 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4564 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4565 // The MULHU will be expanded to UMULL for the types not listed below,
4566 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4567 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4568 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4569 LT.second == MVT::nxv16i8;
4570 bool Is128bit = LT.second.is128BitVector();
4571
4572 InstructionCost MulCost =
4573 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4574 Op1Info.getNoProps(), Op2Info.getNoProps());
4575 InstructionCost AddCost =
4576 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4577 Op1Info.getNoProps(), Op2Info.getNoProps());
4578 InstructionCost ShrCost =
4579 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4580 Op1Info.getNoProps(), Op2Info.getNoProps());
4581 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4582 (HasMULH ? 0 : ShrCost) + // UMULL shift
4583 AddCost * 2 + ShrCost;
// A remainder additionally pays for one multiply and one add on top of the
// division sequence.
4584 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4585 }
4586 }
4587
4588 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4589 // emitted by the backend even when those functions are not declared in the
4590 // module.
4591 if (!VT.isVector() && VT.getSizeInBits() > 64)
4592 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4593
// NOTE(review): the line that begins the declaration of Cost is missing from
// this listing; only the argument list of its initializer call remains.
4595 Opcode, Ty, CostKind, Op1Info, Op2Info);
4596 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4597 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4598 // When SDIV/UDIV operations are lowered using SVE, the costs are
4599 // lower.
4600 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4601 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4602 static const CostTblEntry DivTbl[]{
4603 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4604 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4605 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4606 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4607 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4608 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4609
4610 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4611 if (nullptr != Entry)
4612 return Entry->Cost;
4613 }
4614 // For 8/16-bit elements, the cost is higher because the type
4615 // requires promotion and possibly splitting:
4616 if (LT.second.getScalarType() == MVT::i8)
4617 Cost *= 8;
4618 else if (LT.second.getScalarType() == MVT::i16)
4619 Cost *= 4;
4620 return Cost;
4621 } else {
4622 // If one of the operands is a uniform constant then the cost for each
4623 // element is Cost for insertion, extraction and division.
4624 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4625 // operation with scalar type
4626 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4627 (Op2Info.isConstant() && Op2Info.isUniform())) {
4628 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
// NOTE(review): the line that begins the declaration of DivCost is missing
// from this listing; only the argument list of its initializer call remains.
4630 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4631 return (4 + DivCost) * VTy->getNumElements();
4632 }
4633 }
4634 // On AArch64, without SVE, vector divisions are expanded
4635 // into scalar divisions of each pair of elements.
4636 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4637 -1, nullptr, nullptr);
4638 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4639 nullptr, nullptr);
4640 }
4641
4642 // TODO: if one of the arguments is scalar, then it's not necessary to
4643 // double the cost of handling the vector elements.
4644 Cost += Cost;
4645 }
4646 return Cost;
4647 }
4648 case ISD::XOR:
4649 case ISD::OR:
4650 case ISD::AND:
4651 case ISD::SRL:
4652 case ISD::SRA:
4653 case ISD::SHL:
4654 // These nodes are marked as 'custom' for combining purposes only.
4655 // We know that they are legal. See LowerAdd in ISelLowering.
4656 return LT.first;
4657
4658 case ISD::FNEG:
4659 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4660 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4661 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4662 CxtI &&
4663 ((CxtI->hasOneUse() &&
4664 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4665 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4666 return 0;
4667 [[fallthrough]];
4668 case ISD::FADD:
4669 case ISD::FSUB:
4670 if (!Ty->getScalarType()->isFP128Ty())
4671 return LT.first;
// fp128 add/sub/neg continue into the FMUL/FDIV case, where they reach the
// BaseT fallback.
4672 [[fallthrough]];
4673 case ISD::FMUL:
4674 case ISD::FDIV:
4675 // These nodes are marked as 'custom' just to lower them to SVE.
4676 // We know said lowering will incur no additional cost.
4677 if (!Ty->getScalarType()->isFP128Ty())
4678 return 2 * LT.first;
4679
4680 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4681 Op2Info);
4682 case ISD::FREM:
4683 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4684 // those functions are not declared in the module.
4685 if (!Ty->isVectorTy())
4686 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4687 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4688 Op2Info);
4689 }
4690}
4691
// NOTE(review): most of this definition's signature (return type, name and the
// parameters that declare PtrTy and SE) is missing from this listing.
// Returns NeonNonConstStrideOverhead when PtrTy is a vector type, SE is
// non-null and the access is not a constant stride smaller than
// MaxMergeDistance + 1; otherwise returns 1.
4694 const SCEV *Ptr,
4696 // Address computations in vectorized code with non-consecutive addresses will
4697 // likely result in more instructions compared to scalar code where the
4698 // computation can more often be merged into the index mode. The resulting
4699 // extra micro-ops can significantly decrease throughput.
4700 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4701 int MaxMergeDistance = 64;
4702
4703 if (PtrTy->isVectorTy() && SE &&
4704 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4705 return NumVectorInstToHideOverhead;
4706
4707 // In many cases the address computation is not merged into the instruction
4708 // addressing mode.
4709 return 1;
4710}
4711
4712 /// Check whether Opcode1 has less throughput according to the scheduling
4713 /// model than Opcode2.
/// Returns false when the subtarget has no instruction scheduling model or
/// when either opcode's scheduling class is invalid.
// NOTE(review): the opening line of this definition (return type and function
// name) and the right-hand operand of the final comparison are missing from
// this listing.
4715 unsigned Opcode1, unsigned Opcode2) const {
4716 const MCSchedModel &Sched = ST->getSchedModel();
4717 const TargetInstrInfo *TII = ST->getInstrInfo();
4718 if (!Sched.hasInstrSchedModel())
4719 return false;
4720
4721 const MCSchedClassDesc *SCD1 =
4722 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4723 const MCSchedClassDesc *SCD2 =
4724 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4725 // We cannot handle variant scheduling classes without an MI. If we need to
4726 // support them for any of the instructions we query the information of we
4727 // might need to add a way to resolve them without a MI or not use the
4728 // scheduling info.
4729 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4730 "Cannot handle variant scheduling classes without an MI");
4731 if (!SCD1->isValid() || !SCD2->isValid())
4732 return false;
4733
// A larger reciprocal throughput for Opcode1 is what this function reports as
// lower throughput.
4734 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4736}
4737
// NOTE(review): several lines of this definition are missing from this
// listing: the opening line (return type and function name), the parameter
// line that declares CostKind and Op1Info, parts of the lambda passed to
// getFP16BF16PromoteCost, and part of one condition in the FCmp path.
// Computes the cost of a compare or select. Three cases are special-cased:
// selects on fixed-width vectors, floating-point compares, and integer
// compares against 0/1/-1 whose first operand is an 'and'. Everything else is
// delegated to BaseT::getCmpSelInstrCost.
4739 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4741 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4742 // We don't lower some vector selects well that are wider than the register
4743 // width. TODO: Improve this with different cost kinds.
4744 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4745 // We would need this many instructions to hide the scalarization happening.
4746 const int AmortizationCost = 20;
4747
4748 // If VecPred is not set, check if we can get a predicate from the context
4749 // instruction, if its type matches the requested ValTy.
4750 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4751 CmpPredicate CurrentPred;
4752 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4753 m_Value())))
4754 VecPred = CurrentPred;
4755 }
4756 // Check if we have a compare/select chain that can be lowered using
4757 // a (F)CMxx & BFI pair.
4758 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4759 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4760 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4761 VecPred == CmpInst::FCMP_UNE) {
4762 static const auto ValidMinMaxTys = {
4763 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4764 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4765 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4766
4767 auto LT = getTypeLegalizationCost(ValTy);
4768 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4769 (ST->hasFullFP16() &&
4770 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4771 return LT.first;
4772 }
4773
// Fixed costs for selects keyed by (condition type, value type).
4774 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4775 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4776 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4777 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4778 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4779 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4780 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4781 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4782 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4783 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4784 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4785 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4786
4787 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4788 EVT SelValTy = TLI->getValueType(DL, ValTy);
4789 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4790 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4791 SelCondTy.getSimpleVT(),
4792 SelValTy.getSimpleVT()))
4793 return Entry->Cost;
4794 }
4795 }
4796
4797 if (Opcode == Instruction::FCmp) {
4798 if (auto PromotedCost = getFP16BF16PromoteCost(
4799 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4800 // TODO: Consider costing SVE FCMPs.
4801 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
// NOTE(review): several lines of this lambda are missing from this listing
// (the declaration of Cost and most of the cast-cost call for vector types).
4803 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4804 CostKind, Op1Info, Op2Info);
4805 if (isa<VectorType>(PromotedTy))
4807 Instruction::Trunc,
4811 return Cost;
4812 }))
4813 return *PromotedCost;
4814
4815 auto LT = getTypeLegalizationCost(ValTy);
4816 // Model unknown fp compares as a libcall.
4817 if (LT.second.getScalarType() != MVT::f64 &&
4818 LT.second.getScalarType() != MVT::f32 &&
4819 LT.second.getScalarType() != MVT::f16)
4820 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4821 {ValTy, ValTy}, CostKind);
4822
4823 // Some comparison operators require expanding to multiple compares + or.
4824 unsigned Factor = 1;
4825 if (!CondTy->isVectorTy() &&
4826 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4827 Factor = 2; // fcmp with 2 selects
4828 else if (isa<FixedVectorType>(ValTy) &&
4829 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4830 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4831 Factor = 3; // fcmxx+fcmyy+or
4832 else if (isa<ScalableVectorType>(ValTy) &&
4833 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4834 Factor = 3; // fcmxx+fcmyy+or
4835
// NOTE(review): one line of the condition below is missing from this listing.
// The visible part doubles Factor for scalable vectors when the scheduling
// model reports lower throughput for FCMEQ_PPzZZ_S than for FCMEQv4f32.
4836 if (isa<ScalableVectorType>(ValTy) &&
4838 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4839 AArch64::FCMEQv4f32))
4840 Factor *= 2;
4841
4842 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4843 }
4844
4845 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4846 // icmp(and, 0) as free, as we can make use of ands, but only if the
4847 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4848 // providing it will not cause performance regressions.
4849 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4850 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4851 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4852 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4853 if (match(I->getOperand(1), m_Zero()))
4854 return 0;
4855
4856 // x >= 1 / x < 1 -> x > 0 / x <= 0
4857 if (match(I->getOperand(1), m_One()) &&
4858 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4859 return 0;
4860
4861 // x <= -1 / x > -1 -> x < 0 / x >= 0
4862 if (match(I->getOperand(1), m_AllOnes()) &&
4863 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4864 return 0;
4865 }
4866
4867 // The base case handles scalable vectors fine for now, since it treats the
4868 // cost as 1 * legalization cost.
4869 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4870 Op1Info, Op2Info, I);
4871}
4872
// NOTE(review): the return-type line and the declaration of Options are
// missing from this listing.
// Fills in the memcmp expansion options. With strict alignment required,
// Options is returned without any of the fields below being set. Otherwise
// overlapping loads are allowed, the load budget comes from
// TLI->getMaxExpandSizeMemcmp(OptSize), and load sizes are 8, 4, 2 and 1.
// IsZeroCmp is not used in the visible code.
4874AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4876 if (ST->requiresStrictAlign()) {
4877 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4878 // a bunch of instructions when strict align is enabled.
4879 return Options;
4880 }
4881 Options.AllowOverlappingLoads = true;
4882 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4883 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4884 // TODO: Though vector loads usually perform well on AArch64, in some targets
4885 // they may wake up the FP unit, which raises the power consumption. Perhaps
4886 // they could be used with no holds barred (-O3).
4887 Options.LoadSizes = {8, 4, 2, 1};
4888 Options.AllowedTailExpansions = {3, 5, 6};
4889 return Options;
4890}
4891
// NOTE(review): the signature of this definition is missing from this listing;
// the block name is presumed from the upstream file, so confirm it.
// The result is simply whether the subtarget has SVE.
4893 return ST->hasSVE();
4894}
4895
// NOTE(review): the signature of this definition and the statement after the
// switch are missing from this listing; the block name is presumed, so
// confirm it.
// Dispatches on the intrinsic ID carried by MICA: masked gather/scatter go to
// getGatherScatterOpCost, masked load/expandload/store go to
// getMaskedMemoryOpCost. Other IDs leave the switch and reach the statement
// that is not visible here.
4899 switch (MICA.getID()) {
4900 case Intrinsic::masked_scatter:
4901 case Intrinsic::masked_gather:
4902 return getGatherScatterOpCost(MICA, CostKind);
4903 case Intrinsic::masked_load:
4904 case Intrinsic::masked_expandload:
4905 case Intrinsic::masked_store:
4906 return getMaskedMemoryOpCost(MICA, CostKind);
4907 }
4909}
4910
// NOTE(review): the signature of this definition is missing from this listing,
// and so is the statement guarded by each of the early checks below (the lines
// following useNeonVector, the validity check, the i1 check, the
// <vscale x 1 x ...> check and the expand-load legality check).
// Cost of a masked load/store/expandload described by MICA. The starting
// point is the legalization cost of the data type; expand-loads double it, and
// a split operation on elements wider than 8 bits doubles it again to account
// for splitting the mask.
4914 Type *Src = MICA.getDataType();
4915
4916 if (useNeonVector(Src))
4918 auto LT = getTypeLegalizationCost(Src);
4919 if (!LT.first.isValid())
4921
4922 // Return an invalid cost for element types that we are unable to lower.
4923 auto *VT = cast<VectorType>(Src);
4924 if (VT->getElementType()->isIntegerTy(1))
4926
4927 // The code-generator is currently not able to handle scalable vectors
4928 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4929 // it. This change will be removed when code-generation for these types is
4930 // sufficiently reliable.
4931 if (VT->getElementCount() == ElementCount::getScalable(1))
4933
4934 InstructionCost MemOpCost = LT.first;
4935 if (MICA.getID() == Intrinsic::masked_expandload) {
4936 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
4938
4939 // Operation will be split into expand of masked.load
4940 MemOpCost *= 2;
4941 }
4942
4943 // If we need to split the memory operation, we will also need to split the
4944 // mask. This will likely lead to overestimating the cost in some cases if
4945 // multiple memory operations use the same mask, but we often don't have
4946 // enough context to figure that out here.
4947 //
4948 // If the elements being loaded are bytes then the mask will already be split,
4949 // since the number of bits in a P register matches the number of bytes in a
4950 // Z register.
4951 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
4952 return MemOpCost * 2;
4953
4954 return MemOpCost;
4955}
4956
4957// This function returns gather/scatter overhead either from
4958// user-provided value or specialized values per-target from \p ST.
4959static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4960 const AArch64Subtarget *ST) {
4961 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4962 "Should be called on only load or stores.");
4963 switch (Opcode) {
4964 case Instruction::Load:
4965 if (SVEGatherOverhead.getNumOccurrences() > 0)
4966 return SVEGatherOverhead;
4967 return ST->getGatherOverhead();
4968 break;
4969 case Instruction::Store:
4970 if (SVEScatterOverhead.getNumOccurrences() > 0)
4971 return SVEScatterOverhead;
4972 return ST->getScatterOverhead();
4973 break;
4974 default:
4975 llvm_unreachable("Shouldn't have reached here");
4976 }
4977}
4978
4982
// NOTE(review): the signature of this definition is missing from this listing,
// and so is the statement guarded by each of the early checks below.
// Cost of a masked/VP gather or scatter described by MICA. Gathers are costed
// as loads and everything else as stores. The result is the per-element
// memory-op cost, scaled by the gather/scatter overhead, the legalization
// split count and the maximum element count of the legalized vector type.
4983 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4984 MICA.getID() == Intrinsic::vp_gather)
4985 ? Instruction::Load
4986 : Instruction::Store;
4987
4988 Type *DataTy = MICA.getDataType();
4989 Align Alignment = MICA.getAlignment();
4990 const Instruction *I = MICA.getInst();
4991
4992 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4994 auto *VT = cast<VectorType>(DataTy);
4995 auto LT = getTypeLegalizationCost(DataTy);
4996 if (!LT.first.isValid())
4998
4999 // Return an invalid cost for element types that we are unable to lower.
5000 if (!LT.second.isVector() ||
5001 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
5002 VT->getElementType()->isIntegerTy(1))
5004
5005 // The code-generator is currently not able to handle scalable vectors
5006 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5007 // it. This change will be removed when code-generation for these types is
5008 // sufficiently reliable.
5009 if (VT->getElementCount() == ElementCount::getScalable(1))
5011
5012 ElementCount LegalVF = LT.second.getVectorElementCount();
// The base cost is that of a single element-sized load/store in address
// space 0.
5013 InstructionCost MemOpCost =
5014 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5015 {TTI::OK_AnyValue, TTI::OP_None}, I);
5016 // Add on an overhead cost for using gathers/scatters.
5017 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
5018 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5019}
5020
// NOTE(review): the signature of this definition is missing from this listing;
// the block name is presumed from the useNeonVector(...) calls elsewhere in
// this file, so confirm it.
// True when Ty is a fixed-width vector and the subtarget does not use SVE for
// fixed-length vectors.
5022 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5023}
5024
// NOTE(review): several lines of this definition are missing from this
// listing: the opening line (return type, name and the parameters declaring
// Opcode and Ty), the parameter line declaring CostKind, the statements
// guarded by some early checks, the conditions in front of two early returns,
// and the declaration of Cost used by the NEON worklist loop.
// Computes the cost of a plain load or store of type Ty. Struct types and
// load latency queries go to BaseT::getMemoryOpCost; slow misaligned 128-bit
// stores are made expensive; NEON vectors with a non-power-of-2 element count
// are costed as the number of power-of-2 pieces they decompose into.
5026 Align Alignment,
5027 unsigned AddressSpace,
5029 TTI::OperandValueInfo OpInfo,
5030 const Instruction *I) const {
5031 EVT VT = TLI->getValueType(DL, Ty, true);
5032 // Type legalization can't handle structs, and load latency isn't handled here
5033 if (VT == MVT::Other ||
5034 (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency))
5035 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5036 CostKind);
5037
5038 auto LT = getTypeLegalizationCost(Ty);
5039 if (!LT.first.isValid())
5041
5042 // The code-generator is currently not able to handle scalable vectors
5043 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5044 // it. This change will be removed when code-generation for these types is
5045 // sufficiently reliable.
5046 // We also only support full register predicate loads and stores.
5047 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5048 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5049 (VTy->getElementType()->isIntegerTy(1) &&
5050 !VTy->getElementCount().isKnownMultipleOf(
// NOTE(review): the rest of this condition and the statement it guards are
// missing from this listing.
5053
5054 // TODO: consider latency as well for TCK_SizeAndLatency.
// NOTE(review): the conditions guarding the next two returns are missing from
// this listing.
5056 return LT.first;
5057
5059 return 1;
5060
5061 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5062 LT.second.is128BitVector() && Alignment < Align(16)) {
5063 // Unaligned stores are extremely inefficient. We don't split all
5064 // unaligned 128-bit stores because the negative impact that has shown in
5065 // practice on inlined block copy code.
5066 // We make such stores expensive so that we will only vectorize if there
5067 // are 6 other instructions getting vectorized.
5068 const int AmortizationCost = 6;
5069
5070 return LT.first * 2 * AmortizationCost;
5071 }
5072
5073 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5074 if (Ty->isPtrOrPtrVectorTy())
5075 return LT.first;
5076
5077 if (useNeonVector(Ty)) {
5078 // Check truncating stores and extending loads.
5079 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5080 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
5081 if (VT == MVT::v4i8)
5082 return 2;
5083 // Otherwise we need to scalarize.
5084 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5085 }
5086 EVT EltVT = VT.getVectorElementType();
5087 unsigned EltSize = EltVT.getScalarSizeInBits();
// Only byte-aligned (Align(1)) accesses to sub-128-bit vectors with
// power-of-2 element sizes from 8 to 64 bits go on to the decomposition
// below; everything else costs LT.first.
5088 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5089 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5090 return LT.first;
5091 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5092 // widening to v4i8, which produces suboptimal results.
5093 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5094 return LT.first;
5095
5096 // Check non-power-of-2 loads/stores for legal vector element types with
5097 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5098 // operations on smaller power-of-2 ops, including ld1/st1.
5099 LLVMContext &C = Ty->getContext();
5101 SmallVector<EVT> TypeWorklist;
5102 TypeWorklist.push_back(VT);
// Repeatedly split off the largest power-of-2 prefix; each power-of-2 piece
// adds 1 to Cost.
5103 while (!TypeWorklist.empty()) {
5104 EVT CurrVT = TypeWorklist.pop_back_val();
5105 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5106 if (isPowerOf2_32(CurrNumElements)) {
5107 Cost += 1;
5108 continue;
5109 }
5110
5111 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5112 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5113 TypeWorklist.push_back(
5114 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5115 }
5116 return Cost;
5117 }
5118
5119 return LT.first;
5120}
5121
5123 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5124 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5125 bool UseMaskForCond, bool UseMaskForGaps) const {
5126 assert(Factor >= 2 && "Invalid interleave factor");
5127 auto *VecVTy = cast<VectorType>(VecTy);
5128
5129 if (VecTy->isScalableTy() && !ST->hasSVE())
5131
5132 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5133 // only have lowering for power-of-2 factors.
5134 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5135 // InterleavedAccessPass for ld3/st3
5136 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5138
5139 // Vectorization for masked interleaved accesses is only enabled for scalable
5140 // VF.
5141 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5143
5144 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5145 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5146 auto *SubVecTy =
5147 VectorType::get(VecVTy->getElementType(),
5148 VecVTy->getElementCount().divideCoefficientBy(Factor));
5149
5150 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5151 // Accesses having vector types that are a multiple of 128 bits can be
5152 // matched to more than one ldN/stN instruction.
5153 bool UseScalable;
5154 if (MinElts % Factor == 0 &&
5155 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5156 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5157 }
5158
5159 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5160 Alignment, AddressSpace, CostKind,
5161 UseMaskForCond, UseMaskForGaps);
5162}
5163
5168 for (auto *I : Tys) {
5169 if (!I->isVectorTy())
5170 continue;
5171 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5172 128)
5173 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5174 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5175 }
5176 return Cost;
5177}
5178
5180 Align Alignment) const {
5181 // Neon types should be scalarised when we are not choosing to use SVE.
5182 if (useNeonVector(DataTy))
5183 return false;
5184
5185 // Return true only if we are able to lower using the SVE2p2/SME2p2
5186 // expand instruction.
5187 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5188 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5189}
5190
5192 return ST->getMaxInterleaveFactor();
5193}
5194
5195// For Falkor, we want to avoid having too many strided loads in a loop since
5196// that can exhaust the HW prefetcher resources. We adjust the unroller
5197// MaxCount preference below to attempt to ensure unrolling doesn't create too
5198// many strided loads.
//
// The body counts the loads in the loop whose address is an affine
// add-recurrence and, if at least one is found, overwrites UP.MaxCount with a
// power-of-2 limit derived from MaxStridedLoads. UP.MaxCount is left untouched
// when no strided load is found.
5199static void
5202 enum { MaxStridedLoads = 7 };
// Returns the number of strided loads in L, saturating just above
// MaxStridedLoads / 2.
5203 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5204 int StridedLoads = 0;
5205 // FIXME? We could make this more precise by looking at the CFG and
5206 // e.g. not counting loads in each side of an if-then-else diamond.
5207 for (const auto BB : L->blocks()) {
5208 for (auto &I : *BB) {
// Only load instructions are of interest.
5209 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5210 if (!LMemI)
5211 continue;
5212
// A load from a loop-invariant address is not counted.
5213 Value *PtrValue = LMemI->getPointerOperand();
5214 if (L->isLoopInvariant(PtrValue))
5215 continue;
5216
// Count the load only if its address is an affine SCEV add-recurrence.
5217 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5218 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5219 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5220 continue;
5221
5222 // FIXME? We could take pairing of unrolled load copies into account
5223 // by looking at the AddRec, but we would probably have to limit this
5224 // to loops with no stores or other memory optimization barriers.
5225 ++StridedLoads;
5226 // We've seen enough strided loads that seeing more won't make a
5227 // difference.
// MaxStridedLoads / 2 is 3 (integer division), so counting stops at 4.
5228 if (StridedLoads > MaxStridedLoads / 2)
5229 return StridedLoads;
5230 }
5231 }
5232 return StridedLoads;
5233 };
5234
5235 int StridedLoads = countStridedLoads(L, SE);
5236 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5237 << " strided loads\n");
5238 // Pick the largest power of 2 unroll count that won't result in too many
5239 // strided loads.
// The division below is only performed when StridedLoads is non-zero.
5240 if (StridedLoads) {
5241 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5242 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5243 << UP.MaxCount << '\n');
5244 }
5245}
5246
5247// This function returns true if the loop:
5248// 1. Has a valid cost, and
5249// 2. Has a cost within the supplied budget.
5250// Otherwise it returns false.
//
// The loop cost is the sum of TTI.getInstructionCost(..., TTI::TCK_CodeSize)
// over every instruction in every block of L. On success, if FinalSize is
// non-null, the accumulated cost is written to *FinalSize; *FinalSize is not
// written when the function returns false.
5252 InstructionCost Budget,
5253 unsigned *FinalSize) {
5254 // Estimate the size of the loop.
5255 InstructionCost LoopCost = 0;
5256
5257 for (auto *BB : L->getBlocks()) {
5258 for (auto &I : *BB) {
5259 SmallVector<const Value *, 4> Operands(I.operand_values());
5260 InstructionCost Cost =
5261 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5262 // This can happen with intrinsics that don't currently have a cost model
5263 // or for some operations that require SVE.
5264 if (!Cost.isValid())
5265 return false;
5266
// Stop scanning as soon as the running total exceeds the budget.
5267 LoopCost += Cost;
5268 if (LoopCost > Budget)
5269 return false;
5270 }
5271 }
5272
// FinalSize is an optional out-parameter; callers may pass nullptr.
5273 if (FinalSize)
5274 *FinalSize = LoopCost.getValue();
5275 return true;
5276}
5277
5279 const AArch64TTIImpl &TTI) {
5280 // Only consider loops with unknown trip counts for which we can determine
5281 // a symbolic expression. Multi-exit loops with small known trip counts will
5282 // likely be unrolled anyway.
5283 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5285 return false;
5286
5287 // It might not be worth unrolling loops with low max trip counts. Restrict
5288 // this to max trip counts > 32 for now.
5289 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5290 if (MaxTC > 0 && MaxTC <= 32)
5291 return false;
5292
5293 // Make sure the loop size is <= 5.
5294 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5295 return false;
5296
5297 // Small search loops with multiple exits can be highly beneficial to unroll.
5298 // We only care about loops with exactly two exiting blocks, although each
5299 // block could jump to the same exit block.
5300 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5301 if (Blocks.size() != 2)
5302 return false;
5303
5304 if (any_of(Blocks, [](BasicBlock *BB) {
5306 }))
5307 return false;
5308
5309 return true;
5310}
5311
5312/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5313/// OOO engine's wide instruction window and various predictors.
5314static void
5317 const AArch64TTIImpl &TTI) {
5318 // Limit loops with structure that is highly likely to benefit from runtime
5319 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5320 // likely with complex control flow). Note that the heuristics here may be
5321 // overly conservative and we err on the side of avoiding runtime unrolling
5322 // rather than unroll excessively. They are all subject to further refinement.
5323 if (!L->isInnermost() || L->getNumBlocks() > 8)
5324 return;
5325
5326 // Loops with multiple exits are handled by common code.
5327 if (!L->getExitBlock())
5328 return;
5329
5330 // Check if the loop contains any reductions that could be parallelized when
5331 // unrolling. If so, enable partial unrolling, if the trip count is known to
5332 // be a multiple of 2.
5333 bool HasParellelizableReductions =
5334 L->getNumBlocks() == 1 &&
5335 any_of(L->getHeader()->phis(),
5336 [&SE, L](PHINode &Phi) {
5337 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5338 }) &&
5339 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5340 if (HasParellelizableReductions &&
5341 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5342 UP.Partial = true;
5343 UP.MaxCount = 4;
5344 UP.AddAdditionalAccumulators = true;
5345 }
5346
5347 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5349 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5350 SE.getSmallConstantMaxTripCount(L) <= 32))
5351 return;
5352
5353 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5354 return;
5355
5357 return;
5358
5359 // Limit to loops with trip counts that are cheap to expand.
5360 UP.SCEVExpansionBudget = 1;
5361
5362 if (HasParellelizableReductions) {
5363 UP.Runtime = true;
5365 UP.AddAdditionalAccumulators = true;
5366 }
5367
5368 // Try to unroll small loops, of few-blocks with low budget, if they have
5369 // load/store dependencies, to expose more parallel memory access streams,
5370 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5371 BasicBlock *Header = L->getHeader();
5372 BasicBlock *Latch = L->getLoopLatch();
5373 if (Header == Latch) {
5374 // Estimate the size of the loop.
5375 unsigned Size;
5376 unsigned Width = 10;
5377 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5378 return;
5379
5380 // Try to find an unroll count that maximizes the use of the instruction
5381 // window, i.e. trying to fetch as many instructions per cycle as possible.
5382 unsigned MaxInstsPerLine = 16;
5383 unsigned UC = 1;
5384 unsigned BestUC = 1;
5385 unsigned SizeWithBestUC = BestUC * Size;
5386 while (UC <= 8) {
5387 unsigned SizeWithUC = UC * Size;
5388 if (SizeWithUC > 48)
5389 break;
5390 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5391 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5392 BestUC = UC;
5393 SizeWithBestUC = BestUC * Size;
5394 }
5395 UC++;
5396 }
5397
5398 if (BestUC == 1)
5399 return;
5400
5401 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5403 for (auto *BB : L->blocks()) {
5404 for (auto &I : *BB) {
5406 if (!Ptr)
5407 continue;
5408 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5409 if (SE.isLoopInvariant(PtrSCEV, L))
5410 continue;
5411 if (isa<LoadInst>(&I)) {
5412 LoadedValuesPlus.insert(&I);
5413 // Include in-loop 1st users of loaded values.
5414 for (auto *U : I.users())
5415 if (L->contains(cast<Instruction>(U)))
5416 LoadedValuesPlus.insert(U);
5417 } else
5418 Stores.push_back(cast<StoreInst>(&I));
5419 }
5420 }
5421
5422 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5423 return LoadedValuesPlus.contains(SI->getOperand(0));
5424 }))
5425 return;
5426
5427 UP.Runtime = true;
5428 UP.DefaultUnrollRuntimeCount = BestUC;
5429 return;
5430 }
5431
5432 // Try to runtime-unroll loops with early-continues depending on loop-varying
5433 // loads; this helps with branch-prediction for the early-continues.
5434 auto *Term = dyn_cast<CondBrInst>(Header->getTerminator());
5436 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5437 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5438 return;
5439
5440 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5441 [&](Instruction *I, unsigned Depth) -> bool {
5442 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5443 return false;
5444
5445 if (isa<LoadInst>(I))
5446 return true;
5447
5448 return any_of(I->operands(), [&](Value *V) {
5449 auto *I = dyn_cast<Instruction>(V);
5450 return I && DependsOnLoopLoad(I, Depth + 1);
5451 });
5452 };
5453 CmpPredicate Pred;
5454 Instruction *I;
5455 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5456 m_Value())) &&
5457 DependsOnLoopLoad(I, 0)) {
5458 UP.Runtime = true;
5459 }
5460}
5461
5464 OptimizationRemarkEmitter *ORE) const {
5465 // Enable partial unrolling and runtime unrolling.
5466 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5467
5468 UP.UpperBound = true;
5469
5470 // For inner loop, it is more likely to be a hot one, and the runtime check
5471 // can be promoted out from LICM pass, so the overhead is less, let's try
5472 // a larger threshold to unroll more loops.
5473 if (L->getLoopDepth() > 1)
5474 UP.PartialThreshold *= 2;
5475
5476 // Disable partial & runtime unrolling on -Os.
5478
5479 // Scan the loop: don't unroll loops with calls as this could prevent
5480 // inlining. Don't unroll auto-vectorized loops either, though do allow
5481 // unrolling of the scalar remainder.
5482 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5484 for (auto *BB : L->getBlocks()) {
5485 for (auto &I : *BB) {
5486 // Both auto-vectorized loops and the scalar remainder have the
5487 // isvectorized attribute, so differentiate between them by the presence
5488 // of vector instructions.
5489 if (IsVectorized && I.getType()->isVectorTy())
5490 return;
5491 if (isa<CallBase>(I)) {
5494 if (!isLoweredToCall(F))
5495 continue;
5496 return;
5497 }
5498
5499 SmallVector<const Value *, 4> Operands(I.operand_values());
5500 Cost += getInstructionCost(&I, Operands,
5502 }
5503 }
5504
5505 // Apply subtarget-specific unrolling preferences.
5506 if (ST->isAppleMLike())
5507 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5508 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5511
5512 // If this is a small, multi-exit loop similar to something like std::find,
5513 // then there is typically a performance improvement achieved by unrolling.
5514 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5515 UP.RuntimeUnrollMultiExit = true;
5516 UP.Runtime = true;
5517 // Limit unroll count.
5519 // Allow slightly more costly trip-count expansion to catch search loops
5520 // with pointer inductions.
5521 UP.SCEVExpansionBudget = 5;
5522 return;
5523 }
5524
5525 // Enable runtime unrolling for in-order models.
5526 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so by
5527 // checking for that case, we can ensure that the default behaviour is
5528 // unchanged.
5529 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5530 !ST->getSchedModel().isOutOfOrder()) {
5531 UP.Runtime = true;
5532 UP.Partial = true;
5533 UP.UnrollRemainder = true;
5535
5536 UP.UnrollAndJam = true;
5538 }
5539
5540 // Forcing the unrolling of small loops can be very useful because of the
5541 // branch-taken cost of the backedge.
5543 UP.Force = true;
5544}
5545
5550
5552 Type *ExpectedType,
5553 bool CanCreate) const {
// For the NEON st1xN/stN intrinsics, rebuilds the stored operands as a single
// struct value of type ExpectedType. For the NEON ld1xN/ldN intrinsics,
// returns the intrinsic call itself when its type already equals ExpectedType.
// Returns nullptr in every other case.
5554 switch (Inst->getIntrinsicID()) {
5555 default:
5556 return nullptr;
5557 case Intrinsic::aarch64_neon_st1x2:
5558 case Intrinsic::aarch64_neon_st1x3:
5559 case Intrinsic::aarch64_neon_st1x4:
5560 case Intrinsic::aarch64_neon_st2:
5561 case Intrinsic::aarch64_neon_st3:
5562 case Intrinsic::aarch64_neon_st4: {
5563 // The expected result must be a struct type, and the caller must allow new
// IR to be created. Note that this local ST is a StructType, unlike the ST
// used as a subtarget pointer by other functions in this file.
5564 StructType *ST = dyn_cast<StructType>(ExpectedType);
5565 if (!CanCreate || !ST)
5566 return nullptr;
// Every argument except the last one is a candidate struct element.
5567 unsigned NumElts = Inst->arg_size() - 1;
5568 if (ST->getNumElements() != NumElts)
5569 return nullptr;
// Each of those arguments must have exactly the type of the matching struct
// element.
5570 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5571 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5572 return nullptr;
5573 }
// Start from a poison struct and insert the arguments one element at a time,
// using a builder constructed from Inst.
5574 Value *Res = PoisonValue::get(ExpectedType);
5575 IRBuilder<> Builder(Inst);
5576 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5577 Value *L = Inst->getArgOperand(i);
5578 Res = Builder.CreateInsertValue(Res, L, i);
5579 }
5580 return Res;
5581 }
5582 case Intrinsic::aarch64_neon_ld1x2:
5583 case Intrinsic::aarch64_neon_ld1x3:
5584 case Intrinsic::aarch64_neon_ld1x4:
5585 case Intrinsic::aarch64_neon_ld2:
5586 case Intrinsic::aarch64_neon_ld3:
5587 case Intrinsic::aarch64_neon_ld4:
// A load can only be reused as-is; nothing is created for it.
5588 if (Inst->getType() == ExpectedType)
5589 return Inst;
5590 return nullptr;
5591 }
5592}
5593
5595 MemIntrinsicInfo &Info) const {
// Describes the NEON ld1xN/ldN and st1xN/stN intrinsics as memory operations.
// For those intrinsics it fills in Info.ReadMem, Info.WriteMem, Info.PtrVal and
// Info.MatchingId and returns true. For any other intrinsic it returns false
// without writing to Info.
5596 switch (Inst->getIntrinsicID()) {
5597 default:
5598 break;
5599 case Intrinsic::aarch64_neon_ld1x2:
5600 case Intrinsic::aarch64_neon_ld1x3:
5601 case Intrinsic::aarch64_neon_ld1x4:
5602 case Intrinsic::aarch64_neon_ld2:
5603 case Intrinsic::aarch64_neon_ld3:
5604 case Intrinsic::aarch64_neon_ld4:
// Loads: the pointer is taken from the first argument.
5605 Info.ReadMem = true;
5606 Info.WriteMem = false;
5607 Info.PtrVal = Inst->getArgOperand(0);
5608 break;
5609 case Intrinsic::aarch64_neon_st1x2:
5610 case Intrinsic::aarch64_neon_st1x3:
5611 case Intrinsic::aarch64_neon_st1x4:
5612 case Intrinsic::aarch64_neon_st2:
5613 case Intrinsic::aarch64_neon_st3:
5614 case Intrinsic::aarch64_neon_st4:
// Stores: the pointer is taken from the last argument.
5615 Info.ReadMem = false;
5616 Info.WriteMem = true;
5617 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5618 break;
5619 }
5620
5621 // Use the ID of neon load as the "matching id".
// Each store shares its MatchingId with the load of the same form.
5622 switch (Inst->getIntrinsicID()) {
5623 default:
5624 return false;
5625 case Intrinsic::aarch64_neon_ld1x2:
5626 case Intrinsic::aarch64_neon_st1x2:
5627 Info.MatchingId = Intrinsic::aarch64_neon_ld1x2;
5628 break;
5629 case Intrinsic::aarch64_neon_ld1x3:
5630 case Intrinsic::aarch64_neon_st1x3:
5631 Info.MatchingId = Intrinsic::aarch64_neon_ld1x3;
5632 break;
5633 case Intrinsic::aarch64_neon_ld1x4:
5634 case Intrinsic::aarch64_neon_st1x4:
5635 Info.MatchingId = Intrinsic::aarch64_neon_ld1x4;
5636 break;
5637 case Intrinsic::aarch64_neon_ld2:
5638 case Intrinsic::aarch64_neon_st2:
5639 Info.MatchingId = Intrinsic::aarch64_neon_ld2;
5640 break;
5641 case Intrinsic::aarch64_neon_ld3:
5642 case Intrinsic::aarch64_neon_st3:
5643 Info.MatchingId = Intrinsic::aarch64_neon_ld3;
5644 break;
5645 case Intrinsic::aarch64_neon_ld4:
5646 case Intrinsic::aarch64_neon_st4:
5647 Info.MatchingId = Intrinsic::aarch64_neon_ld4;
5648 break;
5649 }
5650 return true;
5651}
5652
5653/// See if \p I should be considered for address type promotion. We check if \p
5654/// I is a sext with the right type and used in memory accesses. If it is used
5655/// in a "complex" getelementptr, we allow it to be promoted without finding other
5656/// sext instructions that sign extended the same initial value. A getelementptr
5657/// is considered as "complex" if it has more than 2 operands.
///
/// Returns true if \p I is a sext producing an i64 that has at least one
/// getelementptr user. \p AllowPromotionWithoutCommonHeader is always written:
/// it is set to true only when one of those getelementptr users is "complex".
5659 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5660 bool Considerable = false;
5661 AllowPromotionWithoutCommonHeader = false;
5662 if (!isa<SExtInst>(&I))
5663 return false;
// Only sign extensions whose result type is i64 are considered.
5664 Type *ConsideredSExtType =
5665 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5666 if (I.getType() != ConsideredSExtType)
5667 return false;
5668 // See if the sext is the one with the right type and used in at least one
5669 // GetElementPtrInst.
5670 for (const User *U : I.users()) {
5671 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5672 Considerable = true;
5673 // A getelementptr is considered as "complex" if it has more than 2
5674 // operands. We will promote a SExt used in such complex GEP as we
5675 // expect some computation to be merged if they are done on 64 bits.
5676 if (GEPInst->getNumOperands() > 2) {
5677 AllowPromotionWithoutCommonHeader = true;
// One complex GEP is enough; the remaining users need not be inspected.
5678 break;
5679 }
5680 }
5681 }
5682 return Considerable;
5683}
5684
5686 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5687 if (!VF.isScalable())
5688 return true;
5689
5690 Type *Ty = RdxDesc.getRecurrenceType();
5691 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5692 return false;
5693
5694 switch (RdxDesc.getRecurrenceKind()) {
5695 case RecurKind::Sub:
5696 case RecurKind::FSub:
5699 case RecurKind::Add:
5700 case RecurKind::FAdd:
5701 case RecurKind::And:
5702 case RecurKind::Or:
5703 case RecurKind::Xor:
5704 case RecurKind::SMin:
5705 case RecurKind::SMax:
5706 case RecurKind::UMin:
5707 case RecurKind::UMax:
5708 case RecurKind::FMin:
5709 case RecurKind::FMax:
5710 case RecurKind::FMulAdd:
5711 case RecurKind::AnyOf:
5713 return true;
5714 default:
5715 return false;
5716 }
5717}
5718
5721 FastMathFlags FMF,
5723 // The code-generator is currently not able to handle scalable vectors
5724 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5725 // it. This change will be removed when code-generation for these types is
5726 // sufficiently reliable.
5727 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5728 if (VTy->getElementCount() == ElementCount::getScalable(1))
5730
5731 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5732
5733 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5734 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5735
5736 InstructionCost LegalizationCost = 0;
5737 if (LT.first > 1) {
5738 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5739 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5740 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5741 }
5742
5743 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5744}
5745
5747 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5748 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5749 InstructionCost LegalizationCost = 0;
5750 if (LT.first > 1) {
5751 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5752 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5753 LegalizationCost *= LT.first - 1;
5754 }
5755
5756 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5757 assert(ISD && "Invalid opcode");
5758 // Add the final reduction cost for the legal horizontal reduction
5759 switch (ISD) {
5760 case ISD::ADD:
5761 case ISD::AND:
5762 case ISD::OR:
5763 case ISD::XOR:
5764 case ISD::FADD:
5765 return LegalizationCost + 2;
5766 default:
5768 }
5769}
5770
5773 std::optional<FastMathFlags> FMF,
5775 // The code-generator is currently not able to handle scalable vectors
5776 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5777 // it. This change will be removed when code-generation for these types is
5778 // sufficiently reliable.
5779 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5780 if (VTy->getElementCount() == ElementCount::getScalable(1))
5782
5784 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5785 InstructionCost BaseCost =
5786 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5787 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5788 // end up vectorizing for more computationally intensive loops.
5789 return BaseCost + FixedVTy->getNumElements();
5790 }
5791
5792 if (Opcode != Instruction::FAdd || ValTy->getElementType()->isBFloatTy())
5794
5795 auto *VTy = cast<ScalableVectorType>(ValTy);
5797 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5798 Cost *= getMaxNumElements(VTy->getElementCount());
5799 return Cost;
5800 }
5801
5802 if (isa<ScalableVectorType>(ValTy))
5803 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5804
5805 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5806 MVT MTy = LT.second;
5807 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5808 assert(ISD && "Invalid opcode");
5809
5810 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5811 // instructions as twice a normal vector add, plus 1 for each legalization
5812 // step (LT.first). This is the only arithmetic vector reduction operation for
5813 // which we have an instruction.
5814 // OR, XOR and AND costs should match the codegen from:
5815 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5816 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5817 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5818 static const CostTblEntry CostTblNoPairwise[]{
5819 {ISD::ADD, MVT::v8i8, 2},
5820 {ISD::ADD, MVT::v16i8, 2},
5821 {ISD::ADD, MVT::v4i16, 2},
5822 {ISD::ADD, MVT::v8i16, 2},
5823 {ISD::ADD, MVT::v2i32, 2},
5824 {ISD::ADD, MVT::v4i32, 2},
5825 {ISD::ADD, MVT::v2i64, 2},
5826 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5827 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5828 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5829 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5830 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5831 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5832 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5833 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5834 {ISD::XOR, MVT::v16i8, 7},
5835 {ISD::XOR, MVT::v4i16, 4},
5836 {ISD::XOR, MVT::v8i16, 6},
5837 {ISD::XOR, MVT::v2i32, 3},
5838 {ISD::XOR, MVT::v4i32, 5},
5839 {ISD::XOR, MVT::v2i64, 3},
5840 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5841 {ISD::AND, MVT::v16i8, 7},
5842 {ISD::AND, MVT::v4i16, 4},
5843 {ISD::AND, MVT::v8i16, 6},
5844 {ISD::AND, MVT::v2i32, 3},
5845 {ISD::AND, MVT::v4i32, 5},
5846 {ISD::AND, MVT::v2i64, 3},
5847 };
5848 switch (ISD) {
5849 default:
5850 break;
5851 case ISD::FADD:
5852 if (Type *EltTy = ValTy->getScalarType();
5853 // FIXME: For half types without fullfp16 support, this could extend and
5854 // use a fp32 faddp reduction but current codegen unrolls.
5855 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5856 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5857 const unsigned NElts = MTy.getVectorNumElements();
5858 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5859 isPowerOf2_32(NElts))
5860 // Reduction corresponding to series of fadd instructions is lowered to
5861 // series of faddp instructions. faddp has latency/throughput that
5862 // matches fadd instruction and hence, every faddp instruction can be
5863 // considered to have a relative cost = 1 with
5864 // CostKind = TCK_RecipThroughput.
5865 // An faddp will pairwise add vector elements, so the size of input
5866 // vector reduces by half every time, requiring
5867 // #(faddp instructions) = log2_32(NElts).
5868 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5869 }
5870 break;
5871 case ISD::ADD:
5872 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5873 return (LT.first - 1) + Entry->Cost;
5874 break;
5875 case ISD::XOR:
5876 case ISD::AND:
5877 case ISD::OR:
5878 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5879 if (!Entry)
5880 break;
5881 auto *ValVTy = cast<FixedVectorType>(ValTy);
5882 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5883 isPowerOf2_32(ValVTy->getNumElements())) {
5884 InstructionCost ExtraCost = 0;
5885 if (LT.first != 1) {
5886 // Type needs to be split, so there is an extra cost of LT.first - 1
5887 // arithmetic ops.
5888 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5889 MTy.getVectorNumElements());
5890 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5891 ExtraCost *= LT.first - 1;
5892 }
5893 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5894 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5895 return Cost + ExtraCost;
5896 }
5897 break;
5898 }
5899 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5900}
5901
5903 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5904 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
// Returns a target-specific cost for Add reductions whose vector type is a
// simple type of at least 64 bits and whose legalized vector type and result
// width match one of the cases listed below. Every other combination is
// delegated to BaseT::getExtendedReductionCost.
5905 EVT VecVT = TLI->getValueType(DL, VecTy);
5906 EVT ResVT = TLI->getValueType(DL, ResTy);
5907
5908 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5909 VecVT.getSizeInBits() >= 64) {
5910 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5911
5912 // The legal cases are:
5913 // UADDLV 8/16/32->32
5914 // UADDLP 32->64
// RevVTSize is the size in bits of the result type.
5915 unsigned RevVTSize = ResVT.getSizeInBits();
5916 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5917 RevVTSize <= 32) ||
5918 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5919 RevVTSize <= 32) ||
5920 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5921 RevVTSize <= 64))
// Base cost of 2, plus 2 for each unit of legalization cost beyond the first
// (LT.first is presumably the number of legal-width parts - TODO confirm).
5922 return (LT.first - 1) * 2 + 2;
5923 }
5924
5925 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5926 CostKind);
5927}
5928
5930AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5931 Type *ResTy, VectorType *VecTy,
5933 EVT VecVT = TLI->getValueType(DL, VecTy);
5934 EVT ResVT = TLI->getValueType(DL, ResTy);
5935
// A target-specific cost is only returned when the subtarget has the dot
// product feature, both types are simple, and the reduction opcode is Add.
// Everything else is delegated to BaseT::getMulAccReductionCost.
5936 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5937 RedOpcode == Instruction::Add) {
5938 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5939
5940 // The legal cases with dotprod are
5941 // UDOT 8->32
5942 // Which requires an additional uaddv to sum the i32 values.
// Only i8 element vectors that legalize to v8i8 or v16i8 with an i32 result
// qualify; the cost is the legalization cost plus 2.
5943 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5944 ResVT == MVT::i32)
5945 return LT.first + 2;
5946 }
5947
5948 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5949 CostKind);
5950}
5951
5955 static const CostTblEntry ShuffleTbl[] = {
5956 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5957 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5958 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5959 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5960 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5961 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5962 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5963 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5964 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5965 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5966 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5967 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5968 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5969 };
5970
5971 // The code-generator is currently not able to handle scalable vectors
5972 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5973 // it. This change will be removed when code-generation for these types is
5974 // sufficiently reliable.
5977
5978 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5979 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5980 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5981 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5982 : LT.second;
5983 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5984 InstructionCost LegalizationCost = 0;
5985 if (Index < 0) {
5986 LegalizationCost =
5987 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5989 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5991 }
5992
5993 // Splices of predicate (i1) vectors are promoted when lowering (see
5994 // AArch64ISelLowering.cpp), so the cost is computed on the promoted type.
5995 if (LT.second.getScalarType() == MVT::i1) {
5996 LegalizationCost +=
5997 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5999 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
6001 }
6002 const auto *Entry =
6003 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
6004 assert(Entry && "Illegal Type for Splice");
6005 LegalizationCost += Entry->Cost;
6006 return LegalizationCost * LT.first;
6007}
6008
6010 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
6012 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
6013 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
6015
6017 return Invalid;
6018
6019 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
6020 Opcode != Instruction::FAdd && Opcode != Instruction::FSub) ||
6021 OpAExtend == TTI::PR_None)
6022 return Invalid;
6023
6024 // Floating-point partial reductions are invalid if `reassoc` and `contract`
6025 // are not allowed.
6026 if (AccumType->isFloatingPointTy()) {
6027 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
6028 if (!FMF->allowReassoc() || !FMF->allowContract())
6029 return Invalid;
6030 } else {
6031 assert(!FMF &&
6032 "FastMathFlags only apply to floating-point partial reductions");
6033 }
6034
6035 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
6036 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
6037 "Unexpected values for OpBExtend or InputTypeB");
6038
6039 // We only support multiply binary operations for now, and for muls we
6040 // require the types being extended to be the same.
6041 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6042 InputTypeA != InputTypeB))
6043 return Invalid;
6044
6045 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6046 // USDot is natively supported with +i8mm. With plain +dotprod, SUMLA is
6047 // lowered to two udots plus an eor and a sub.
6048 if (IsUSDot && !ST->hasMatMulInt8() && !ST->hasDotProd())
6049 // FIXME: Remove this early bailout in favour of expand cost.
6050 return Invalid;
6051
6052 unsigned Ratio =
6053 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6054 if (VF.getKnownMinValue() <= Ratio)
6055 return Invalid;
6056
6057 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6058 VectorType *AccumVectorType =
6059 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6060 // We don't yet support all kinds of legalization.
6061 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6062 EVT::getEVT(AccumVectorType));
6063 switch (TC.first) {
6064 default:
6065 return Invalid;
6069 // The legalised type (e.g. after splitting) must be legal too.
6070 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6072 return Invalid;
6073 break;
6074 }
6075
6076 std::pair<InstructionCost, MVT> AccumLT =
6077 getTypeLegalizationCost(AccumVectorType);
6078 std::pair<InstructionCost, MVT> InputLT =
6079 getTypeLegalizationCost(InputVectorType);
6080
6081 // Returns true if the subtarget supports the operation for a given type.
6082 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6083 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6084 (AccumLT.second.isFixedLengthVector() &&
6085 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6086 NEONPred);
6087 };
6088
6089 bool IsSub = Opcode == Instruction::Sub || Opcode == Instruction::FSub;
6090 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6091 // Integer partial sub-reductions that don't map to a specific instruction,
6092 // carry an extra cost for implementing a double negation:
6093 // partial_reduce_umls acc, lhs, rhs
6094 // <=> -partial_reduce_umla -acc, lhs, rhs
6095 InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0;
6096
6097 if (AccumLT.second.getScalarType() == MVT::i32 &&
6098 InputLT.second.getScalarType() == MVT::i8) {
6099 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6100 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6101 return Cost + INegCost;
6102 // i8 -> i32 usdot requires +i8mm
6103 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6104 return Cost + INegCost;
6105 // Without +i8mm, lower SUMLA via two udots plus an eor and a sub on plain
6106 // +dotprod targets. Note that this is only implemented for NEON, as all
6107 // modern CPUs with SVE also have +i8mm. Charge an extra factor for the
6108 // expansion.
6109 if (IsUSDot && IsSupported(false, ST->hasDotProd()))
6110 return Cost * 3 + INegCost;
6111 }
6112
6113 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6114 // i16 -> i64 is natively supported for udot/sdot
6115 if (AccumLT.second.getScalarType() == MVT::i64 &&
6116 InputLT.second.getScalarType() == MVT::i16)
6117 return Cost + INegCost;
6118 // i16 -> i32 is natively supported with SVE2p1 udot/sdot.
6119 // For sub-reductions, we prefer using the *mlslb/t instructions.
6120 if (AccumLT.second.getScalarType() == MVT::i32 &&
6121 InputLT.second.getScalarType() == MVT::i16 &&
6122 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6123 return Cost;
6124 // i8 -> i64 is supported with an extra level of extends
6125 if (AccumLT.second.getScalarType() == MVT::i64 &&
6126 InputLT.second.getScalarType() == MVT::i8)
6127 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6128 // because it requires two extra extends on the inputs. But if we'd change
6129 // that now, a regular reduction would be cheaper because the costs of
6130 // the extends in the IR are still counted. This can be fixed
6131 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6132 return Cost + INegCost;
6133 // i8 -> i16 is natively supported with SVE2p3 udot/sdot
6134 // For sub-reductions, we prefer using the *mlslb/t instructions.
6135 if (AccumLT.second.getScalarType() == MVT::i16 &&
6136 InputLT.second.getScalarType() == MVT::i8 &&
6137 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6138 return Cost;
6139 }
6140
6141 // f16 -> f32 is natively supported for fdot using either
6142 // SVE or NEON instruction.
6143 if (Opcode == Instruction::FAdd && !IsSub &&
6144 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6145 AccumLT.second.getScalarType() == MVT::f32 &&
6146 InputLT.second.getScalarType() == MVT::f16)
6147 return Cost;
6148
6149 // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions.
6150 if (Ratio == 2 && !IsUSDot) {
6151 MVT InVT = InputLT.second.getScalarType();
6152
6153 // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2)
6154 if (IsSupported(ST->hasSVE2() || ST->hasSME(), true) &&
6155 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6156 return Cost * 2;
6157
6158 // SVE2 fml[as]lb/t and NEON fml[as]l(2)
6159 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6160 return Cost * 2;
6161
6162 // SME2/SVE2p1 bfmlslb/t
6163 if (IsSupported(ST->hasSVE2p1() || ST->hasSME2(), false) &&
6164 InVT == MVT::bf16 && IsSub)
6165 return Cost * 2;
6166
6167 // FP partial sub-reductions that don't map to a specific instruction,
6168 // carry an extra cost for implementing an extra negation:
6169 // partial_reduce_fmls acc, lhs, rhs
6170 // <=> partial_reduce_fmla acc, lhs, -rhs
6171 InstructionCost FNegCost = IsSub ? InputLT.first * TTI::TCC_Basic : 0;
6172
6173 // SVE and NEON bfmlalb/t
6174 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6175 return Cost * 2 + FNegCost;
6176 }
6177
6178 return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
6179 AccumType, VF, OpAExtend, OpBExtend,
6180 BinOp, CostKind, FMF);
6181}
6182
6185 VectorType *SrcTy, ArrayRef<int> Mask,
6186 TTI::TargetCostKind CostKind, int Index,
6188 const Instruction *CxtI) const {
6189 assert((Mask.empty() || DstTy->isScalableTy() ||
6190 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6191 "Expected the Mask to match the return size if given");
6192 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6193 "Expected the same scalar types");
6194 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6195
6196 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6197 // into smaller vectors and sum the cost of each shuffle.
6198 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6199 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6200 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6201 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6202 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6203 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6204 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6205 // cost than just the load.
6206 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6209 return std::max<InstructionCost>(1, LT.first / 4);
6210
6211 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6212 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6213 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6214 // cost than just the store.
6215 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6217 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6219 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6220 return LT.first;
6221
6222 unsigned TpNumElts = Mask.size();
6223 unsigned LTNumElts = LT.second.getVectorNumElements();
6224 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6225 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6226 LT.second.getVectorElementCount());
6228 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6229 PreviousCosts;
6230 for (unsigned N = 0; N < NumVecs; N++) {
6231 SmallVector<int> NMask;
6232 // Split the existing mask into chunks of size LTNumElts. Track the source
6233 // sub-vectors to ensure the result has at most 2 inputs.
6234 unsigned Source1 = -1U, Source2 = -1U;
6235 unsigned NumSources = 0;
6236 for (unsigned E = 0; E < LTNumElts; E++) {
6237 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6239 if (MaskElt < 0) {
6241 continue;
6242 }
6243
6244 // Calculate which source from the input this comes from and whether it
6245 // is new to us.
6246 unsigned Source = MaskElt / LTNumElts;
6247 if (NumSources == 0) {
6248 Source1 = Source;
6249 NumSources = 1;
6250 } else if (NumSources == 1 && Source != Source1) {
6251 Source2 = Source;
6252 NumSources = 2;
6253 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6254 NumSources++;
6255 }
6256
6257 // Add to the new mask. For the NumSources>2 case these are not correct,
6258 // but are only used for the modular lane number.
6259 if (Source == Source1)
6260 NMask.push_back(MaskElt % LTNumElts);
6261 else if (Source == Source2)
6262 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6263 else
6264 NMask.push_back(MaskElt % LTNumElts);
6265 }
6266 // Check if we have already generated this sub-shuffle, which means we
6267 // will have already generated the output. For example a <16 x i32> splat
6268 // will be the same sub-splat 4 times, which only needs to be generated
6269 // once and reused.
6270 auto Result =
6271 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6272 // Check if it was already in the map (already costed).
6273 if (!Result.second)
6274 continue;
6275 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6276 // getShuffleCost. If not then cost it using the worst case as the number
6277 // of element moves into a new vector.
6278 InstructionCost NCost =
6279 NumSources <= 2
6280 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6282 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6283 CxtI)
6284 : LTNumElts;
6285 Result.first->second = NCost;
6286 Cost += NCost;
6287 }
6288 return Cost;
6289 }
6290
6291 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6292 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6293 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6294 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6295 // This currently only handles low or high extracts to prevent SLP vectorizer
6296 // regressions.
6297 // Note that SVE's ext instruction is destructive, but it can be fused with
6298 // a movprfx to act like a constructive instruction.
6299 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6300 if (LT.second.getFixedSizeInBits() >= 128 &&
6301 cast<FixedVectorType>(SubTp)->getNumElements() ==
6302 LT.second.getVectorNumElements() / 2) {
6303 if (Index == 0)
6304 return 0;
6305 if (Index == (int)LT.second.getVectorNumElements() / 2)
6306 return 1;
6307 }
6309 }
6310 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6311 // the code to handle length-changing shuffles.
6312 if (Kind == TTI::SK_InsertSubvector) {
6313 LT = getTypeLegalizationCost(DstTy);
6314 SrcTy = DstTy;
6315 }
6316
6317 // Check for identity masks, which we can treat as free for both fixed and
6318 // scalable vector paths.
6319 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6320 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6321 all_of(enumerate(Mask), [](const auto &M) {
6322 return M.value() < 0 || M.value() == (int)M.index();
6323 }))
6324 return 0;
6325
6326 // Segmented shuffle matching.
6327 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6328 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6329 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6331
6333 unsigned Segments =
6335 unsigned SegmentElts = VTy->getNumElements() / Segments;
6336
6337 // dupq zd.t, zn.t[idx]
6338 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6339 ST->isSVEorStreamingSVEAvailable() &&
6340 isDUPQMask(Mask, Segments, SegmentElts))
6341 return LT.first;
6342
6343 // mov zd.q, vn
6344 if (ST->isSVEorStreamingSVEAvailable() &&
6345 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6346 return LT.first;
6347 }
6348
6349 // Check for broadcast loads, which are supported by the LD1R instruction.
6350 // In terms of code-size, the shuffle vector is free when a load + dup get
6351 // folded into a LD1R. That's what we check and return here. For performance
6352 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6353 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6354 // that we model the load + dup sequence slightly higher because LD1R is a
6355 // high latency instruction.
6356 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6357 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6358 if (IsLoad && LT.second.isVector() &&
6359 isLegalBroadcastLoad(SrcTy->getElementType(),
6360 LT.second.getVectorElementCount()))
6361 return 0;
6362 }
6363
6364 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6365 // from the perfect shuffle tables.
6366 if (Mask.size() == 4 &&
6367 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6368 (SrcTy->getScalarSizeInBits() == 16 ||
6369 SrcTy->getScalarSizeInBits() == 32) &&
6370 all_of(Mask, [](int E) { return E < 8; }))
6371 return getPerfectShuffleCost(Mask);
6372
6373 // Check for other shuffles that are not SK_ kinds but we have native
6374 // instructions for, for example ZIP and UZP.
6375 unsigned Unused;
6376 if (LT.second.isFixedLengthVector() &&
6377 LT.second.getVectorNumElements() == Mask.size() &&
6378 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6379 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6380 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6381 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6382 Kind == TTI::SK_InsertSubvector) &&
6383 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6384 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6385 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6386 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6387 LT.second.getVectorNumElements(), 16) ||
6388 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6389 LT.second.getVectorNumElements(), 32) ||
6390 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6391 LT.second.getVectorNumElements(), 64) ||
6392 // Check for non-zero lane splats
6393 all_of(drop_begin(Mask),
6394 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6395 return 1;
6396
6397 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6398 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6399 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6400 static const CostTblEntry ShuffleTbl[] = {
6401 // Broadcast shuffle kinds can be performed with 'dup'.
6402 {TTI::SK_Broadcast, MVT::v8i8, 1},
6403 {TTI::SK_Broadcast, MVT::v16i8, 1},
6404 {TTI::SK_Broadcast, MVT::v4i16, 1},
6405 {TTI::SK_Broadcast, MVT::v8i16, 1},
6406 {TTI::SK_Broadcast, MVT::v2i32, 1},
6407 {TTI::SK_Broadcast, MVT::v4i32, 1},
6408 {TTI::SK_Broadcast, MVT::v2i64, 1},
6409 {TTI::SK_Broadcast, MVT::v4f16, 1},
6410 {TTI::SK_Broadcast, MVT::v8f16, 1},
6411 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6412 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6413 {TTI::SK_Broadcast, MVT::v2f32, 1},
6414 {TTI::SK_Broadcast, MVT::v4f32, 1},
6415 {TTI::SK_Broadcast, MVT::v2f64, 1},
6416 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6417 // 'zip1/zip2' instructions.
6418 {TTI::SK_Transpose, MVT::v8i8, 1},
6419 {TTI::SK_Transpose, MVT::v16i8, 1},
6420 {TTI::SK_Transpose, MVT::v4i16, 1},
6421 {TTI::SK_Transpose, MVT::v8i16, 1},
6422 {TTI::SK_Transpose, MVT::v2i32, 1},
6423 {TTI::SK_Transpose, MVT::v4i32, 1},
6424 {TTI::SK_Transpose, MVT::v2i64, 1},
6425 {TTI::SK_Transpose, MVT::v4f16, 1},
6426 {TTI::SK_Transpose, MVT::v8f16, 1},
6427 {TTI::SK_Transpose, MVT::v4bf16, 1},
6428 {TTI::SK_Transpose, MVT::v8bf16, 1},
6429 {TTI::SK_Transpose, MVT::v2f32, 1},
6430 {TTI::SK_Transpose, MVT::v4f32, 1},
6431 {TTI::SK_Transpose, MVT::v2f64, 1},
6432 // Select shuffle kinds.
6433 // TODO: handle vXi8/vXi16.
6434 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6435 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6436 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6437 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6438 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6439 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6440 // PermuteSingleSrc shuffle kinds.
6441 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6442 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6443 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6444 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6445 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6446 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6447 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6448 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6449 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6450 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6451 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6452 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6453 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6454 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6455 // Reverse can be lowered with `rev`.
6456 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6457 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6458 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6459 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6460 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6461 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6462 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6463 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6464 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6465 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6466 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6467 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6468 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6469 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6470 // Splice can all be lowered as `ext`.
6471 {TTI::SK_Splice, MVT::v2i32, 1},
6472 {TTI::SK_Splice, MVT::v4i32, 1},
6473 {TTI::SK_Splice, MVT::v2i64, 1},
6474 {TTI::SK_Splice, MVT::v2f32, 1},
6475 {TTI::SK_Splice, MVT::v4f32, 1},
6476 {TTI::SK_Splice, MVT::v2f64, 1},
6477 {TTI::SK_Splice, MVT::v8f16, 1},
6478 {TTI::SK_Splice, MVT::v8bf16, 1},
6479 {TTI::SK_Splice, MVT::v8i16, 1},
6480 {TTI::SK_Splice, MVT::v16i8, 1},
6481 {TTI::SK_Splice, MVT::v4f16, 1},
6482 {TTI::SK_Splice, MVT::v4bf16, 1},
6483 {TTI::SK_Splice, MVT::v4i16, 1},
6484 {TTI::SK_Splice, MVT::v8i8, 1},
6485 // Broadcast shuffle kinds for scalable vectors
6486 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6487 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6488 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6489 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6490 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6491 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6492 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6493 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6494 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6495 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6496 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6497 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6498 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6499 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6500 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6501 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6502 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6503 // Handle the cases for vector.reverse with scalable vectors
6504 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6505 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6506 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6507 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6508 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6509 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6510 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6511 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6512 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6513 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6514 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6515 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6516 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6517 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6518 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6519 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6520 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6521 };
6522 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6523 return LT.first * Entry->Cost;
6524 }
6525
6526 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6527 return getSpliceCost(SrcTy, Index, CostKind);
6528
6529 // Inserting a subvector can often be done with either a D, S or H register
6530 // move, so long as the inserted vector is "aligned".
6531 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6532 LT.second.getSizeInBits() <= 128 && SubTp) {
6533 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6534 if (SubLT.second.isVector()) {
6535 int NumElts = LT.second.getVectorNumElements();
6536 int NumSubElts = SubLT.second.getVectorNumElements();
6537 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6538 return SubLT.first;
6539 }
6540 }
6541
6542 // Restore optimal kind.
6543 if (IsExtractSubvector)
6545 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6546 Args, CxtI);
6547}
6548
6551 const DominatorTree &DT) {
6552 const auto &Strides = DenseMap<Value *, const SCEV *>();
6553 for (BasicBlock *BB : TheLoop->blocks()) {
6554 // Scan the instructions in the block and look for addresses that are
6555 // consecutive and decreasing.
6556 for (Instruction &I : *BB) {
6557 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6559 Type *AccessTy = getLoadStoreType(&I);
6560 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6561 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6562 .value_or(0) < 0)
6563 return true;
6564 }
6565 }
6566 }
6567 return false;
6568}
6569
6571 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6573 // For cases like post-LTO vectorization, when we eventually know the trip
6574 // count, epilogue with fixed-width vectorization can be deleted if the trip
6575 // count is less than the epilogue iterations. That's why we prefer
6576 // fixed-width vectorization in epilogue in case of equal costs.
6577 if (IsEpilogue)
6578 return true;
6579 return ST->useFixedOverScalableIfEqualCost();
6580}
6581
6583 return ST->getEpilogueVectorizationMinVF();
6584}
6585
6587 if (!ST->hasSVE())
6588 return false;
6589
6590 // We don't currently support vectorisation with interleaving for SVE - with
6591 // such loops we're better off not using tail-folding. This gives us a chance
6592 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6593 if (TFI->IAI->hasGroups())
6594 return false;
6595
6597 if (TFI->LVL->getReductionVars().size())
6598 Required |= TailFoldingOpts::Reductions;
6599 if (TFI->LVL->getFixedOrderRecurrences().size())
6600 Required |= TailFoldingOpts::Recurrences;
6601
6602 // We call this to discover whether any load/store pointers in the loop have
6603 // negative strides. This will require extra work to reverse the loop
6604 // predicate, which may be expensive.
6607 *TFI->LVL->getDominatorTree()))
6608 Required |= TailFoldingOpts::Reverse;
6609 if (Required == TailFoldingOpts::Disabled)
6610 Required |= TailFoldingOpts::Simple;
6611
6612 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6613 Required))
6614 return false;
6615
6616 // Don't tail-fold for tight loops where we would be better off interleaving
6617 // with an unpredicated loop.
6618 unsigned NumInsns = 0;
6619 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6620 NumInsns += BB->size();
6621 }
6622
6623 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6624 return NumInsns >= SVETailFoldInsnThreshold;
6625}
6626
6629 StackOffset BaseOffset, bool HasBaseReg,
6630 int64_t Scale, unsigned AddrSpace) const {
6631 // Scaling factors are not free at all.
6632 // Operands | Rt Latency
6633 // -------------------------------------------
6634 // Rt, [Xn, Xm] | 4
6635 // -------------------------------------------
6636 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6637 // Rt, [Xn, Wm, <extend> #imm] |
6639 AM.BaseGV = BaseGV;
6640 AM.BaseOffs = BaseOffset.getFixed();
6641 AM.HasBaseReg = HasBaseReg;
6642 AM.Scale = Scale;
6643 AM.ScalableOffset = BaseOffset.getScalable();
6644 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6645 // Scale represents reg2 * scale, thus account for 1 if
6646 // it is not equal to 0 or 1.
6647 return AM.Scale != 0 && AM.Scale != 1;
6649}
6650
6652 const Instruction *I) const {
6654 // For the binary operators (e.g. or) we need to be more careful than
6655 // selects, here we only transform them if they are already at a natural
6656 // break point in the code - the end of a block with an unconditional
6657 // terminator.
6658 if (I->getOpcode() == Instruction::Or &&
6659 isa<UncondBrInst>(I->getNextNode()))
6660 return true;
6661
6662 if (I->getOpcode() == Instruction::Add ||
6663 I->getOpcode() == Instruction::Sub)
6664 return true;
6665 }
6667}
6668
6671 const TargetTransformInfo::LSRCost &C2) const {
6672 // AArch64 specific here is adding the number of instructions to the
6673 // comparison (though not as the first consideration, as some targets do)
6674 // along with changing the priority of the base additions.
6675 // TODO: Maybe a more nuanced tradeoff between instruction count
6676 // and number of registers? To be investigated at a later date.
6677 if (EnableLSRCostOpt)
6678 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6679 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6680 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6681 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6682
6684}
6685
6686static bool isSplatShuffle(Value *V) {
6687 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6688 return all_equal(Shuf->getShuffleMask());
6689 return false;
6690}
6691
6692/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6693/// or upper half of the vector elements.
6694static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6695 bool AllowSplat = false) {
6696 // Scalable types can't be extract shuffle vectors.
6697 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6698 return false;
6699
6700 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6701 auto *FullTy = FullV->getType();
6702 auto *HalfTy = HalfV->getType();
6703 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6704 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6705 };
6706
6707 auto extractHalf = [](Value *FullV, Value *HalfV) {
6708 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6709 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6710 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6711 };
6712
6713 ArrayRef<int> M1, M2;
6714 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6715 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6716 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6717 return false;
6718
6719 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6720 // it is not checked as an extract below.
6721 if (AllowSplat && isSplatShuffle(Op1))
6722 S1Op1 = nullptr;
6723 if (AllowSplat && isSplatShuffle(Op2))
6724 S2Op1 = nullptr;
6725
6726 // Check that the operands are half as wide as the result and we extract
6727 // half of the elements of the input vectors.
6728 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6729 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6730 return false;
6731
6732 // Check the mask extracts either the lower or upper half of vector
6733 // elements.
6734 int M1Start = 0;
6735 int M2Start = 0;
6736 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6737 if ((S1Op1 &&
6738 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6739 (S2Op1 &&
6740 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6741 return false;
6742
6743 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6744 (M2Start != 0 && M2Start != (NumElements / 2)))
6745 return false;
6746 if (S1Op1 && S2Op1 && M1Start != M2Start)
6747 return false;
6748
6749 return true;
6750}
6751
6752/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6753/// of the vector elements.
6754static bool areExtractExts(Value *Ext1, Value *Ext2) {
6755 auto areExtDoubled = [](Instruction *Ext) {
6756 return Ext->getType()->getScalarSizeInBits() ==
6757 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6758 };
6759
6760 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6761 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6762 !areExtDoubled(cast<Instruction>(Ext1)) ||
6763 !areExtDoubled(cast<Instruction>(Ext2)))
6764 return false;
6765
6766 return true;
6767}
6768
6769/// Check if Op could be used with vmull_high_p64 intrinsic.
6771 Value *VectorOperand = nullptr;
6772 ConstantInt *ElementIndex = nullptr;
6773 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6774 m_ConstantInt(ElementIndex))) &&
6775 ElementIndex->getValue() == 1 &&
6776 isa<FixedVectorType>(VectorOperand->getType()) &&
6777 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6778}
6779
6780/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6781static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6783}
6784
6786 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6787 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6788 if (!GEP || GEP->getNumOperands() != 2)
6789 return false;
6790
6791 Value *Base = GEP->getOperand(0);
6792 Value *Offsets = GEP->getOperand(1);
6793
6794 // We only care about scalar_base+vector_offsets.
6795 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6796 return false;
6797
6798 // Sink extends that would allow us to use 32-bit offset vectors.
6799 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6800 auto *OffsetsInst = cast<Instruction>(Offsets);
6801 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6802 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6803 Ops.push_back(&GEP->getOperandUse(1));
6804 }
6805
6806 // Sink the GEP.
6807 return true;
6808}
6809
6810/// We want to sink following cases:
6811/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6812/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6814 if (match(Op, m_VScale()))
6815 return true;
6816 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6818 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6819 return true;
6820 }
6821 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6823 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6824 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6825 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6826 return true;
6827 }
6828 return false;
6829}
6830
6831static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6832
6833/// Check if sinking \p I's operands to I's basic block is profitable, because
6834/// the operands can be folded into a target instruction, e.g.
6835/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6839 switch (II->getIntrinsicID()) {
6840 case Intrinsic::aarch64_neon_smull:
6841 case Intrinsic::aarch64_neon_umull:
6842 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6843 /*AllowSplat=*/true)) {
6844 Ops.push_back(&II->getOperandUse(0));
6845 Ops.push_back(&II->getOperandUse(1));
6846 return true;
6847 }
6848 [[fallthrough]];
6849
6850 case Intrinsic::fma:
6851 case Intrinsic::fmuladd:
6852 if (isa<VectorType>(I->getType()) &&
6853 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6854 !ST->hasFullFP16())
6855 return false;
6856
6857 if (isFNeg(II->getOperand(0)))
6858 Ops.push_back(&II->getOperandUse(0));
6859 if (isFNeg(II->getOperand(1)))
6860 Ops.push_back(&II->getOperandUse(1));
6861
6862 [[fallthrough]];
6863 case Intrinsic::aarch64_neon_sqdmull:
6864 case Intrinsic::aarch64_neon_sqdmulh:
6865 case Intrinsic::aarch64_neon_sqrdmulh:
6866 // Sink splats for index lane variants
6867 if (isSplatShuffle(II->getOperand(0)))
6868 Ops.push_back(&II->getOperandUse(0));
6869 if (isSplatShuffle(II->getOperand(1)))
6870 Ops.push_back(&II->getOperandUse(1));
6871 return !Ops.empty();
6872 case Intrinsic::aarch64_neon_fmlal:
6873 case Intrinsic::aarch64_neon_fmlal2:
6874 case Intrinsic::aarch64_neon_fmlsl:
6875 case Intrinsic::aarch64_neon_fmlsl2:
6876 // Sink splats for index lane variants
6877 if (isSplatShuffle(II->getOperand(1)))
6878 Ops.push_back(&II->getOperandUse(1));
6879 if (isSplatShuffle(II->getOperand(2)))
6880 Ops.push_back(&II->getOperandUse(2));
6881 return !Ops.empty();
6882 case Intrinsic::aarch64_sve_ptest_first:
6883 case Intrinsic::aarch64_sve_ptest_last:
6884 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6885 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6886 Ops.push_back(&II->getOperandUse(0));
6887 return !Ops.empty();
6888 case Intrinsic::aarch64_sme_write_horiz:
6889 case Intrinsic::aarch64_sme_write_vert:
6890 case Intrinsic::aarch64_sme_writeq_horiz:
6891 case Intrinsic::aarch64_sme_writeq_vert: {
6892 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6893 if (!Idx || Idx->getOpcode() != Instruction::Add)
6894 return false;
6895 Ops.push_back(&II->getOperandUse(1));
6896 return true;
6897 }
6898 case Intrinsic::aarch64_sme_read_horiz:
6899 case Intrinsic::aarch64_sme_read_vert:
6900 case Intrinsic::aarch64_sme_readq_horiz:
6901 case Intrinsic::aarch64_sme_readq_vert:
6902 case Intrinsic::aarch64_sme_ld1b_vert:
6903 case Intrinsic::aarch64_sme_ld1h_vert:
6904 case Intrinsic::aarch64_sme_ld1w_vert:
6905 case Intrinsic::aarch64_sme_ld1d_vert:
6906 case Intrinsic::aarch64_sme_ld1q_vert:
6907 case Intrinsic::aarch64_sme_st1b_vert:
6908 case Intrinsic::aarch64_sme_st1h_vert:
6909 case Intrinsic::aarch64_sme_st1w_vert:
6910 case Intrinsic::aarch64_sme_st1d_vert:
6911 case Intrinsic::aarch64_sme_st1q_vert:
6912 case Intrinsic::aarch64_sme_ld1b_horiz:
6913 case Intrinsic::aarch64_sme_ld1h_horiz:
6914 case Intrinsic::aarch64_sme_ld1w_horiz:
6915 case Intrinsic::aarch64_sme_ld1d_horiz:
6916 case Intrinsic::aarch64_sme_ld1q_horiz:
6917 case Intrinsic::aarch64_sme_st1b_horiz:
6918 case Intrinsic::aarch64_sme_st1h_horiz:
6919 case Intrinsic::aarch64_sme_st1w_horiz:
6920 case Intrinsic::aarch64_sme_st1d_horiz:
6921 case Intrinsic::aarch64_sme_st1q_horiz: {
6922 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6923 if (!Idx || Idx->getOpcode() != Instruction::Add)
6924 return false;
6925 Ops.push_back(&II->getOperandUse(3));
6926 return true;
6927 }
6928 case Intrinsic::aarch64_neon_pmull:
6929 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6930 return false;
6931 Ops.push_back(&II->getOperandUse(0));
6932 Ops.push_back(&II->getOperandUse(1));
6933 return true;
6934 case Intrinsic::aarch64_neon_pmull64:
6935 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6936 II->getArgOperand(1)))
6937 return false;
6938 Ops.push_back(&II->getArgOperandUse(0));
6939 Ops.push_back(&II->getArgOperandUse(1));
6940 return true;
6941 case Intrinsic::masked_gather:
6942 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6943 return false;
6944 Ops.push_back(&II->getArgOperandUse(0));
6945 return true;
6946 case Intrinsic::masked_scatter:
6947 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6948 return false;
6949 Ops.push_back(&II->getArgOperandUse(1));
6950 return true;
6951 default:
6952 return false;
6953 }
6954 }
6955
6956 auto ShouldSinkCondition = [](Value *Cond,
6957 SmallVectorImpl<Use *> &Ops) -> bool {
6959 return false;
6961 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6962 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6963 return false;
6964 if (isa<CmpInst>(II->getOperand(0)))
6965 Ops.push_back(&II->getOperandUse(0));
6966 return true;
6967 };
6968
6969 switch (I->getOpcode()) {
6970 case Instruction::GetElementPtr:
6971 case Instruction::Add:
6972 case Instruction::Sub:
6973 // Sink vscales closer to uses for better isel
6974 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6975 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6976 Ops.push_back(&I->getOperandUse(Op));
6977 return true;
6978 }
6979 }
6980 break;
6981 case Instruction::Select: {
6982 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6983 return false;
6984
6985 Ops.push_back(&I->getOperandUse(0));
6986 return true;
6987 }
6988 case Instruction::UncondBr:
6989 return false;
6990 case Instruction::CondBr: {
6991 if (!ShouldSinkCondition(cast<CondBrInst>(I)->getCondition(), Ops))
6992 return false;
6993
6994 Ops.push_back(&I->getOperandUse(0));
6995 return true;
6996 }
6997 case Instruction::FMul:
6998 // fmul with contract flag can be combined with fadd into fma.
6999 // Sinking fneg into this block enables fmls pattern.
7000 if (cast<FPMathOperator>(I)->hasAllowContract()) {
7001 if (isFNeg(I->getOperand(0)))
7002 Ops.push_back(&I->getOperandUse(0));
7003 if (isFNeg(I->getOperand(1)))
7004 Ops.push_back(&I->getOperandUse(1));
7005 }
7006 break;
7007
7008 // Type | BIC | ORN | EON
7009 // ----------------+-----------+-----------+-----------
7010 // scalar | Base | Base | Base
7011 // scalar w/shift | - | - | -
7012 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
7013 // scalable vector | SVE | - | BSL2N
7014 case Instruction::Xor:
7015 // EON only for scalars (possibly expanded fixed vectors)
7016 // and vectors using the SVE2/SME BSL2N instruction.
7017 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
7018 bool HasBSL2N =
7019 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
7020 if (!HasBSL2N)
7021 break;
7022 }
7023 [[fallthrough]];
7024 case Instruction::And:
7025 case Instruction::Or:
7026 // Even though we could use the SVE2/SME BSL2N instruction,
7027 // it might pessimize with an extra MOV depending on register allocation.
7028 if (I->getOpcode() == Instruction::Or &&
7029 isa<ScalableVectorType>(I->getType()))
7030 break;
7031 // Shift can be fold into scalar AND/ORR/EOR,
7032 // but not the non-negated operand of BIC/ORN/EON.
7033 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
7035 break;
7036 for (auto &Op : I->operands()) {
7037 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
7038 if (match(Op.get(), m_Not(m_Value()))) {
7039 Ops.push_back(&Op);
7040 return true;
7041 }
7042 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
7043 if (match(Op.get(),
7045 m_Value(), m_ZeroMask()))) {
7046 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7047 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7048 Ops.push_back(&Not);
7049 Ops.push_back(&InsertElt);
7050 Ops.push_back(&Op);
7051 return true;
7052 }
7053 }
7054 break;
7055 default:
7056 break;
7057 }
7058
7059 if (!I->getType()->isVectorTy())
7060 return !Ops.empty();
7061
7062 switch (I->getOpcode()) {
7063 case Instruction::Sub:
7064 case Instruction::Add: {
7065 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7066 return false;
7067
7068 // If the exts' operands extract either the lower or upper elements, we
7069 // can sink them too.
7070 auto Ext1 = cast<Instruction>(I->getOperand(0));
7071 auto Ext2 = cast<Instruction>(I->getOperand(1));
7072 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7073 Ops.push_back(&Ext1->getOperandUse(0));
7074 Ops.push_back(&Ext2->getOperandUse(0));
7075 }
7076
7077 Ops.push_back(&I->getOperandUse(0));
7078 Ops.push_back(&I->getOperandUse(1));
7079
7080 return true;
7081 }
7082 case Instruction::Or: {
7083 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7084 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7085 if (ST->hasNEON()) {
7086 Instruction *OtherAnd, *IA, *IB;
7087 Value *MaskValue;
7088 // MainAnd refers to And instruction that has 'Not' as one of its operands
7089 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7090 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7091 m_Instruction(IA)))))) {
7092 if (match(OtherAnd,
7093 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
7094 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7095 ? cast<Instruction>(I->getOperand(1))
7096 : cast<Instruction>(I->getOperand(0));
7097
7098 // Both Ands should be in same basic block as Or
7099 if (I->getParent() != MainAnd->getParent() ||
7100 I->getParent() != OtherAnd->getParent())
7101 return false;
7102
7103 // Non-mask operands of both Ands should also be in same basic block
7104 if (I->getParent() != IA->getParent() ||
7105 I->getParent() != IB->getParent())
7106 return false;
7107
7108 Ops.push_back(
7109 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7110 Ops.push_back(&I->getOperandUse(0));
7111 Ops.push_back(&I->getOperandUse(1));
7112
7113 return true;
7114 }
7115 }
7116 }
7117
7118 return false;
7119 }
7120 case Instruction::Mul: {
7121 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7122 auto *Ty = cast<VectorType>(V->getType());
7123 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7124 if (Ty->isScalableTy())
7125 return false;
7126
7127 // Indexed variants of Mul exist for i16 and i32 element types only.
7128 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7129 };
7130
7131 int NumZExts = 0, NumSExts = 0;
7132 for (auto &Op : I->operands()) {
7133 // Make sure we are not already sinking this operand
7134 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7135 continue;
7136
7137 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7138 auto *Ext = cast<Instruction>(Op);
7139 auto *ExtOp = Ext->getOperand(0);
7140 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7141 Ops.push_back(&Ext->getOperandUse(0));
7142 Ops.push_back(&Op);
7143
7144 if (isa<SExtInst>(Ext)) {
7145 NumSExts++;
7146 } else {
7147 NumZExts++;
7148 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7149 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7150 I->getType()->getScalarSizeInBits())
7151 NumSExts++;
7152 }
7153
7154 continue;
7155 }
7156
7158 if (!Shuffle)
7159 continue;
7160
7161 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7162 // operand and the s/zext can help create indexed s/umull. This is
7163 // especially useful to prevent i64 mul being scalarized.
7164 if (isSplatShuffle(Shuffle) &&
7165 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7166 Ops.push_back(&Shuffle->getOperandUse(0));
7167 Ops.push_back(&Op);
7168 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7169 NumSExts++;
7170 else
7171 NumZExts++;
7172 continue;
7173 }
7174
7175 Value *ShuffleOperand = Shuffle->getOperand(0);
7176 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7177 if (!Insert)
7178 continue;
7179
7180 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7181 if (!OperandInstr)
7182 continue;
7183
7184 ConstantInt *ElementConstant =
7185 dyn_cast<ConstantInt>(Insert->getOperand(2));
7186 // Check that the insertelement is inserting into element 0
7187 if (!ElementConstant || !ElementConstant->isZero())
7188 continue;
7189
7190 unsigned Opcode = OperandInstr->getOpcode();
7191 if (Opcode == Instruction::SExt)
7192 NumSExts++;
7193 else if (Opcode == Instruction::ZExt)
7194 NumZExts++;
7195 else {
7196 // If we find that the top bits are known 0, then we can sink and allow
7197 // the backend to generate a umull.
7198 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7199 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7200 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7201 continue;
7202 NumZExts++;
7203 }
7204
7205 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7206 // the And, just to hoist it again back to the load.
7207 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7208 Ops.push_back(&Insert->getOperandUse(1));
7209 Ops.push_back(&Shuffle->getOperandUse(0));
7210 Ops.push_back(&Op);
7211 }
7212
7213 // It is profitable to sink if we found two of the same type of extends.
7214 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7215 return true;
7216
7217 // Otherwise, see if we should sink splats for indexed variants.
7218 if (!ShouldSinkSplatForIndexedVariant(I))
7219 return false;
7220
7221 Ops.clear();
7222 if (isSplatShuffle(I->getOperand(0)))
7223 Ops.push_back(&I->getOperandUse(0));
7224 if (isSplatShuffle(I->getOperand(1)))
7225 Ops.push_back(&I->getOperandUse(1));
7226
7227 return !Ops.empty();
7228 }
7229 case Instruction::FMul: {
7230 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7231 if (I->getType()->isScalableTy())
7232 return !Ops.empty();
7233
7234 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7235 !ST->hasFullFP16())
7236 return !Ops.empty();
7237
7238 // Sink splats for index lane variants
7239 if (isSplatShuffle(I->getOperand(0)))
7240 Ops.push_back(&I->getOperandUse(0));
7241 if (isSplatShuffle(I->getOperand(1)))
7242 Ops.push_back(&I->getOperandUse(1));
7243 return !Ops.empty();
7244 }
7245 default:
7246 return false;
7247 }
7248 return false;
7249}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:254
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:839
bool isUnsigned() const
Definition InstrTypes.h:999
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
bool empty() const
Definition DenseMap.h:173
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:216
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
bool approxFunc() const
Definition FMF.h:70
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2637
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1149
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2625
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:619
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:604
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2020
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2334
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2549
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1759
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2252
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1928
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2659
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1941
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2325
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative, i.e. (x op y) == (y op x).
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
bool isFixedLengthVector() const
MVT getVectorElementType() const
size_type size() const
Definition MapVector.h:58
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:891
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e. a conservative over-approximation of) the value returned by getBackedgeTakenCount.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:989
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y are an integer type, one is the current recurrence value, and the other is an arbitrary value.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:382
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool isFixedLengthVector() const
Definition ValueTypes.h:199
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:187
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
bool isVariant() const
Definition MCSchedule.h:150
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:264
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...