LLVM 23.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
// Features whose meaning is inverted when computing inline compatibility in
// areInlineCompatible: these represent restrictions rather than capabilities,
// e.g. a "+execute-only" callee can be inlined into a caller without
// "+execute-only", but not vice versa.
const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
    AArch64::FeatureExecuteOnly,
};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::scmp:
655 case Intrinsic::ucmp: {
656 static const CostTblEntry BitreverseTbl[] = {
657 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
658 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
659 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
660 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
661 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
662 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
663 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
664 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
665 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
666 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
667 };
668 const auto LT = getTypeLegalizationCost(RetTy);
669 const auto *Entry =
670 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
671 if (Entry)
672 return Entry->Cost * LT.first;
673 break;
674 }
675 case Intrinsic::sadd_sat:
676 case Intrinsic::ssub_sat:
677 case Intrinsic::uadd_sat:
678 case Intrinsic::usub_sat: {
679 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680 MVT::v8i16, MVT::v2i32, MVT::v4i32,
681 MVT::v2i64};
682 auto LT = getTypeLegalizationCost(RetTy);
683 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684 // need to extend the type, as it uses shr(qadd(shl, shl)).
685 unsigned Instrs =
686 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
687 if (any_of(ValidSatTys, equal_to(LT.second)))
688 return LT.first * Instrs;
689
691 uint64_t VectorSize = TS.getKnownMinValue();
692
693 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
694 return LT.first * Instrs;
695
696 break;
697 }
698 case Intrinsic::abs: {
699 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700 MVT::v8i16, MVT::v2i32, MVT::v4i32,
701 MVT::v2i64};
702 auto LT = getTypeLegalizationCost(RetTy);
703 if (any_of(ValidAbsTys, equal_to(LT.second)))
704 return LT.first;
705 break;
706 }
707 case Intrinsic::bswap: {
708 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
709 MVT::v4i32, MVT::v2i64};
710 auto LT = getTypeLegalizationCost(RetTy);
711 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
712 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
713 return LT.first;
714 break;
715 }
716 case Intrinsic::fma:
717 case Intrinsic::fmuladd: {
718 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
719 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
720 Type *EltTy = RetTy->getScalarType();
721 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
722 (EltTy->isHalfTy() && ST->hasFullFP16()))
723 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
724 break;
725 }
726 case Intrinsic::stepvector: {
727 InstructionCost Cost = 1; // Cost of the `index' instruction
728 auto LT = getTypeLegalizationCost(RetTy);
729 // Legalisation of illegal vectors involves an `index' instruction plus
730 // (LT.first - 1) vector adds.
731 if (LT.first > 1) {
732 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
733 InstructionCost AddCost =
734 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
735 Cost += AddCost * (LT.first - 1);
736 }
737 return Cost;
738 }
739 case Intrinsic::vector_extract:
740 case Intrinsic::vector_insert: {
741 // If both the vector and subvector types are legal types and the index
742 // is 0, then this should be a no-op or simple operation; return a
743 // relatively low cost.
744
745 // If arguments aren't actually supplied, then we cannot determine the
746 // value of the index. We also want to skip predicate types.
747 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
749 break;
750
751 LLVMContext &C = RetTy->getContext();
752 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
753 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
754 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
755 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
756 // Skip this if either the vector or subvector types are unpacked
757 // SVE types; they may get lowered to stack stores and loads.
758 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
759 break;
760
762 getTLI()->getTypeConversion(C, SubVecVT);
764 getTLI()->getTypeConversion(C, VecVT);
765 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
766 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
767 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
768 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
769 return TTI::TCC_Free;
770 break;
771 }
772 case Intrinsic::bitreverse: {
773 static const CostTblEntry BitreverseTbl[] = {
774 {Intrinsic::bitreverse, MVT::i32, 1},
775 {Intrinsic::bitreverse, MVT::i64, 1},
776 {Intrinsic::bitreverse, MVT::v8i8, 1},
777 {Intrinsic::bitreverse, MVT::v16i8, 1},
778 {Intrinsic::bitreverse, MVT::v4i16, 2},
779 {Intrinsic::bitreverse, MVT::v8i16, 2},
780 {Intrinsic::bitreverse, MVT::v2i32, 2},
781 {Intrinsic::bitreverse, MVT::v4i32, 2},
782 {Intrinsic::bitreverse, MVT::v1i64, 2},
783 {Intrinsic::bitreverse, MVT::v2i64, 2},
784 };
785 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
786 const auto *Entry =
787 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
788 if (Entry) {
789 // Cost Model is using the legal type(i32) that i8 and i16 will be
790 // converted to +1 so that we match the actual lowering cost
791 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
792 TLI->getValueType(DL, RetTy, true) == MVT::i16)
793 return LegalisationCost.first * Entry->Cost + 1;
794
795 return LegalisationCost.first * Entry->Cost;
796 }
797 break;
798 }
799 case Intrinsic::ctpop: {
800 if (!ST->hasNEON()) {
801 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
802 return getTypeLegalizationCost(RetTy).first * 12;
803 }
804 static const CostTblEntry CtpopCostTbl[] = {
805 {ISD::CTPOP, MVT::v2i64, 4},
806 {ISD::CTPOP, MVT::v4i32, 3},
807 {ISD::CTPOP, MVT::v8i16, 2},
808 {ISD::CTPOP, MVT::v16i8, 1},
809 {ISD::CTPOP, MVT::i64, 4},
810 {ISD::CTPOP, MVT::v2i32, 3},
811 {ISD::CTPOP, MVT::v4i16, 2},
812 {ISD::CTPOP, MVT::v8i8, 1},
813 {ISD::CTPOP, MVT::i32, 5},
814 };
815 auto LT = getTypeLegalizationCost(RetTy);
816 MVT MTy = LT.second;
817 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
818 // Extra cost of +1 when illegal vector types are legalized by promoting
819 // the integer type.
820 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
821 RetTy->getScalarSizeInBits()
822 ? 1
823 : 0;
824 return LT.first * Entry->Cost + ExtraCost;
825 }
826 break;
827 }
828 case Intrinsic::sadd_with_overflow:
829 case Intrinsic::uadd_with_overflow:
830 case Intrinsic::ssub_with_overflow:
831 case Intrinsic::usub_with_overflow:
832 case Intrinsic::smul_with_overflow:
833 case Intrinsic::umul_with_overflow: {
834 static const CostTblEntry WithOverflowCostTbl[] = {
835 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
836 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
837 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
838 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
839 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
840 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
841 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
842 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
843 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
844 {Intrinsic::usub_with_overflow, MVT::i8, 3},
845 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
846 {Intrinsic::usub_with_overflow, MVT::i16, 3},
847 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
848 {Intrinsic::usub_with_overflow, MVT::i32, 1},
849 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
850 {Intrinsic::usub_with_overflow, MVT::i64, 1},
851 {Intrinsic::smul_with_overflow, MVT::i8, 5},
852 {Intrinsic::umul_with_overflow, MVT::i8, 4},
853 {Intrinsic::smul_with_overflow, MVT::i16, 5},
854 {Intrinsic::umul_with_overflow, MVT::i16, 4},
855 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
856 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
857 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
858 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
859 };
860 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
861 if (MTy.isSimple())
862 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
863 MTy.getSimpleVT()))
864 return Entry->Cost;
865 break;
866 }
867 case Intrinsic::fptosi_sat:
868 case Intrinsic::fptoui_sat: {
869 if (ICA.getArgTypes().empty())
870 break;
871 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
872 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
873 EVT MTy = TLI->getValueType(DL, RetTy);
874 // Check for the legal types, which are where the size of the input and the
875 // output are the same, or we are using cvt f64->i32 or f32->i64.
876 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
877 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
878 LT.second == MVT::v2f64)) {
879 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
880 (LT.second == MVT::f64 && MTy == MVT::i32) ||
881 (LT.second == MVT::f32 && MTy == MVT::i64)))
882 return LT.first;
883 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
884 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
885 MTy.getScalarSizeInBits() == 64)
886 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
887 }
888 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
889 // f32.
890 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
891 return LT.first + getIntrinsicInstrCost(
892 {ICA.getID(),
893 RetTy,
894 {ICA.getArgTypes()[0]->getWithNewType(
895 Type::getFloatTy(RetTy->getContext()))}},
896 CostKind);
897 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
898 (LT.second == MVT::f16 && MTy == MVT::i64) ||
899 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
900 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
901 return LT.first;
902 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
903 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
904 MTy.getScalarSizeInBits() == 32)
905 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
906 // Extending vector types v8f16->v8i32. These current scalarize but the
907 // codegen could be better.
908 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
909 MTy.getScalarSizeInBits() == 64)
910 return MTy.getVectorNumElements() * 3;
911
912 // If we can we use a legal convert followed by a min+max
913 if ((LT.second.getScalarType() == MVT::f32 ||
914 LT.second.getScalarType() == MVT::f64 ||
915 LT.second.getScalarType() == MVT::f16) &&
916 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
917 Type *LegalTy =
918 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
919 if (LT.second.isVector())
920 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
922 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
923 LegalTy, {LegalTy, LegalTy});
925 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
926 LegalTy, {LegalTy, LegalTy});
928 return LT.first * Cost +
929 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
930 : 1);
931 }
932 // Otherwise we need to follow the default expansion that clamps the value
933 // using a float min/max with a fcmp+sel for nan handling when signed.
934 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
935 RetTy = RetTy->getScalarType();
936 if (LT.second.isVector()) {
937 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
938 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
939 }
940 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
942 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
944 Cost +=
945 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
947 if (IsSigned) {
948 Type *CondTy = RetTy->getWithNewBitWidth(1);
949 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
951 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
953 }
954 return LT.first * Cost;
955 }
956 case Intrinsic::fshl:
957 case Intrinsic::fshr: {
958 if (ICA.getArgs().empty())
959 break;
960
961 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
962
963 // ROTR / ROTL is a funnel shift with equal first and second operand. For
964 // ROTR on integer registers (i32/i64) this can be done in a single ror
965 // instruction. A fshl with a non-constant shift uses a neg + ror.
966 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
967 (RetTy->getPrimitiveSizeInBits() == 32 ||
968 RetTy->getPrimitiveSizeInBits() == 64)) {
969 InstructionCost NegCost =
970 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
971 return 1 + NegCost;
972 }
973
974 // TODO: Add handling for fshl where third argument is not a constant.
975 if (!OpInfoZ.isConstant())
976 break;
977
978 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
979 if (OpInfoZ.isUniform()) {
980 static const CostTblEntry FshlTbl[] = {
981 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
982 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
983 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
984 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
985 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
986 // to avoid having to duplicate the costs.
987 const auto *Entry =
988 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
989 if (Entry)
990 return LegalisationCost.first * Entry->Cost;
991 }
992
993 auto TyL = getTypeLegalizationCost(RetTy);
994 if (!RetTy->isIntegerTy())
995 break;
996
997 // Estimate cost manually, as types like i8 and i16 will get promoted to
998 // i32 and CostTableLookup will ignore the extra conversion cost.
999 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1000 RetTy->getScalarSizeInBits() < 64) ||
1001 (RetTy->getScalarSizeInBits() % 64 != 0);
1002 unsigned ExtraCost = HigherCost ? 1 : 0;
1003 if (RetTy->getScalarSizeInBits() == 32 ||
1004 RetTy->getScalarSizeInBits() == 64)
1005 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1006 // extr instruction.
1007 else if (HigherCost)
1008 ExtraCost = 1;
1009 else
1010 break;
1011 return TyL.first + ExtraCost;
1012 }
1013 case Intrinsic::get_active_lane_mask: {
1014 auto RetTy = cast<VectorType>(ICA.getReturnType());
1015 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1016 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1017 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1018 break;
1019
1020 if (RetTy->isScalableTy()) {
1021 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1023 break;
1024
1025 auto LT = getTypeLegalizationCost(RetTy);
1026 InstructionCost Cost = LT.first;
1027 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1028 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1029 // nxv32i1 = get_active_lane_mask(base, idx) ->
1030 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1031 if (ST->hasSVE2p1() || ST->hasSME2()) {
1032 Cost /= 2;
1033 if (Cost == 1)
1034 return Cost;
1035 }
1036
1037 // If more than one whilelo intrinsic is required, include the extra cost
1038 // required by the saturating add & select required to increment the
1039 // start value after the first intrinsic call.
1040 Type *OpTy = ICA.getArgTypes()[0];
1041 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1042 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1043 Type *CondTy = OpTy->getWithNewBitWidth(1);
1044 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1046 return Cost + (SplitCost * (Cost - 1));
1047 } else if (!getTLI()->isTypeLegal(RetVT)) {
1048 // We don't have enough context at this point to determine if the mask
1049 // is going to be kept live after the block, which will force the vXi1
1050 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1051 // For now, we just assume the vectorizer created this intrinsic and
1052 // the result will be the input for a PHI. In this case the cost will
1053 // be extremely high for fixed-width vectors.
1054 // NOTE: getScalarizationOverhead returns a cost that's far too
1055 // pessimistic for the actual generated codegen. In reality there are
1056 // two instructions generated per lane.
1057 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1058 }
1059 break;
1060 }
1061 case Intrinsic::experimental_vector_match: {
1062 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1063 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1064 unsigned SearchSize = NeedleTy->getNumElements();
1065 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1066 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1067 // Neoverse V3, these are cheap operations with the same latency as a
1068 // vector ADD. In most cases, however, we also need to do an extra DUP.
1069 // For fixed-length vectors we currently need an extra five--six
1070 // instructions besides the MATCH.
1072 if (isa<FixedVectorType>(RetTy))
1073 Cost += 10;
1074 return Cost;
1075 }
1076 break;
1077 }
1078 case Intrinsic::experimental_cttz_elts: {
1079 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1080 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1081 // This will consist of a SVE brkb and a cntp instruction. These
1082 // typically have the same latency and half the throughput as a vector
1083 // add instruction.
1084 return 4;
1085 }
1086 break;
1087 }
1088 case Intrinsic::loop_dependence_raw_mask:
1089 case Intrinsic::loop_dependence_war_mask: {
1090 // The whilewr/rw instructions require SVE2 or SME.
1091 if (ST->hasSVE2() || ST->hasSME()) {
1092 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1093 unsigned EltSizeInBytes =
1094 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1095 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1096 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1097 break;
1098 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1099 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1100 }
1101 break;
1102 }
1103 case Intrinsic::experimental_vector_extract_last_active:
1104 if (ST->isSVEorStreamingSVEAvailable()) {
1105 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1106 // This should turn into chained clastb instructions.
1107 return LegalCost;
1108 }
1109 break;
1110 case Intrinsic::pow: {
1111 EVT VT = getTLI()->getValueType(DL, RetTy);
1112 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1113 if (getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported)
1114 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1115 break;
1116 }
1117 default:
1118 break;
1119 }
1121}
1122
1123/// The function will remove redundant reinterprets casting in the presence
1124/// of the control flow
// NOTE(review): this listing is a doxygen extraction; original line 1127 is
// missing here. In upstream LLVM it declares the local
// `SmallVector<Instruction *, 32> Worklist;` used below — confirm against the
// real source before editing.
1125static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1126 IntrinsicInst &II) {
1128 auto RequiredType = II.getType();
1129
// The combine only applies when the reinterpret's source is a PHI whose
// incoming values are all `convert_to_svbool` calls of the required type.
1130 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1131 assert(PN && "Expected Phi Node!");
1132
1133 // Don't create a new Phi unless we can remove the old one.
1134 if (!PN->hasOneUse())
1135 return std::nullopt;
1136
// Bail out unless every incoming value is an aarch64.sve.convert_to_svbool
// whose operand already has the type this intrinsic produces.
1137 for (Value *IncValPhi : PN->incoming_values()) {
1138 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1139 if (!Reinterpret ||
1140 Reinterpret->getIntrinsicID() !=
1141 Intrinsic::aarch64_sve_convert_to_svbool ||
1142 RequiredType != Reinterpret->getArgOperand(0)->getType())
1143 return std::nullopt;
1144 }
1145
1146 // Create the new Phi
1147 IC.Builder.SetInsertPoint(PN);
1148 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues())
;
// Queue the old PHI and each reinterpret for deletion (declared on the
// missing line noted above).
1149 Worklist.push_back(PN);
1150
1151 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1152 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1153 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1154 Worklist.push_back(Reinterpret);
1155 }
1156
1157 // Cleanup Phi Node and reinterprets
1158 return IC.replaceInstUsesWith(II, NPN);
1159}
1160
1161// A collection of properties common to SVE intrinsics that allow for combines
1162// to be written without needing to know the specific intrinsic.
// NOTE(review): the extraction lost the `struct SVEIntrinsicInfo {` opening
// line (doxygen line 1163) and the signature lines of most member functions
// (the embedded line numbers jump, e.g. 1170->1177, 1224->1226). The bodies
// below are kept verbatim; consult upstream LLVM for the exact declarations.
1164 //
1165 // Helper routines for common intrinsic definitions.
1166 //
1167
1168 // e.g. llvm.aarch64.sve.add pg, op1, op2
1169 // with IID ==> llvm.aarch64.sve.add_u
1170 static SVEIntrinsicInfo
// NOTE(review): body of defaultMergingOp (lines 1171-1176) missing here.
1177
1178 // e.g. llvm.aarch64.sve.neg inactive, pg, op
// NOTE(review): defaultMergingUnaryOp (lines 1179-1184) missing here.
1185
1186 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
// NOTE(review): defaultMergingUnaryOp variant (lines 1187-1191) missing here.
1192
1193 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
// NOTE(review): defaultUndefOp (lines 1194-1198) missing here.
1199
1200 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1201 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1202 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1203 return SVEIntrinsicInfo()
// NOTE(review): chained setters (lines 1204-1205) missing here.
1206 }
1207
1208 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1209 // llvm.aarch64.sve.ld1 pg, ptr
// NOTE(review): defaultZeroingOp (lines 1210-1215) missing here.
1216
1217 // All properties relate to predication and thus having a general predicate
1218 // is the minimum requirement to say there is intrinsic info to act on.
1219 explicit operator bool() const { return hasGoverningPredicate(); }
1220
1221 //
1222 // Properties relating to the governing predicate.
1223 //
1224
// hasGoverningPredicate(): set iff an operand index has been recorded.
1226 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1227 }
1228
// getGoverningPredicateOperandIdx(): valid only after the property is set.
1230 assert(hasGoverningPredicate() && "Propery not set!");
1231 return GoverningPredicateIdx;
1232 }
1233
// setGoverningPredicateOperandIdx(Index): write-once builder-style setter.
1235 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1236 GoverningPredicateIdx = Index;
1237 return *this;
1238 }
1239
1240 //
1241 // Properties relating to operations the intrinsic could be transformed into.
1242 // NOTE: This does not mean such a transformation is always possible, but the
1243 // knowledge makes it possible to reuse existing optimisations without needing
1244 // to embed specific handling for each intrinsic. For example, instruction
1245 // simplification can be used to optimise an intrinsic's active lanes.
1246 //
1247
// hasMatchingUndefIntrinsic(): true when an `_u` counterpart is recorded.
1249 return UndefIntrinsic != Intrinsic::not_intrinsic;
1250 }
1251
1253 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1254 return UndefIntrinsic;
1255 }
1256
1258 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1259 UndefIntrinsic = IID;
1260 return *this;
1261 }
1262
1263 bool hasMatchingIROpode() const { return IROpcode != 0; }
1264
1265 unsigned getMatchingIROpode() const {
1266 assert(hasMatchingIROpode() && "Propery not set!");
1267 return IROpcode;
1268 }
1269
// setMatchingIROpcode(Opcode): write-once; 0 means "no IR equivalent".
1271 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1272 IROpcode = Opcode;
1273 return *this;
1274 }
1275
1276 //
1277 // Properties relating to the result of inactive lanes.
1278 //
1279
1281 return ResultLanes == InactiveLanesTakenFromOperand;
1282 }
1283
1285 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1286 return OperandIdxForInactiveLanes;
1287 }
1288
// setOperandIdxInactiveLanesTakenFrom(Index): records both the predication
// style and the source operand in one write-once step.
1290 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1291 ResultLanes = InactiveLanesTakenFromOperand;
1292 OperandIdxForInactiveLanes = Index;
1293 return *this;
1294 }
1295
1297 return ResultLanes == InactiveLanesAreNotDefined;
1298 }
1299
1301 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1302 ResultLanes = InactiveLanesAreNotDefined;
1303 return *this;
1304 }
1305
1307 return ResultLanes == InactiveLanesAreUnused;
1308 }
1309
1311 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1312 ResultLanes = InactiveLanesAreUnused;
1313 return *this;
1314 }
1315
1316 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1317 // inactiveLanesAreZeroed =
1318 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1319 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1320
// setResultIsZeroInitialized(): unlike the others, this flag has no
// write-once assert; it is orthogonal to ResultLanes.
1322 ResultIsZeroInitialized = true;
1323 return *this;
1324 }
1325
1326 //
1327 // The first operand of unary merging operations is typically only used to
1328 // set the result for inactive lanes. Knowing this allows us to deadcode the
1329 // operand when we can prove there are no inactive lanes.
1330 //
1331
1333 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1334 }
1335
1337 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1338 return OperandIdxWithNoActiveLanes;
1339 }
1340
1342 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1343 OperandIdxWithNoActiveLanes = Index;
1344 return *this;
1345 }
1346
1347private:
// max() acts as the "unset" sentinel for all operand-index properties.
1348 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1349
1350 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1351 unsigned IROpcode = 0;
1352
// NOTE(review): enumerator `Uninitialized` (line 1354) is missing from this
// listing but is referenced by the asserts above.
1353 enum PredicationStyle {
1355 InactiveLanesTakenFromOperand,
1356 InactiveLanesAreNotDefined,
1357 InactiveLanesAreUnused
1358 } ResultLanes = Uninitialized;
1359
1360 bool ResultIsZeroInitialized = false;
1361 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1362 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1363};
1364
// NOTE(review): the function signature line (doxygen line 1365, upstream:
// `static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {`)
// is missing from this extracted listing, as are several `return` lines
// inside the switch (1408, 1414, 1505-1506, 1517-1518, 1528-1529, and the
// per-group returns after the big case runs). Bodies kept verbatim.
1366 // Some SVE intrinsics do not use scalable vector types, but since they are
1367 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1368 if (!isa<ScalableVectorType>(II.getType()) &&
1369 all_of(II.args(), [&](const Value *V) {
1370 return !isa<ScalableVectorType>(V->getType());
1371 }))
1372 return SVEIntrinsicInfo();
1373
1374 Intrinsic::ID IID = II.getIntrinsicID();
1375 switch (IID) {
1376 default:
1377 break;
// Unary FP/int conversions: inactive lanes come from the `inactive` operand.
1378 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1379 case Intrinsic::aarch64_sve_fcvt_f16f32:
1380 case Intrinsic::aarch64_sve_fcvt_f16f64:
1381 case Intrinsic::aarch64_sve_fcvt_f32f16:
1382 case Intrinsic::aarch64_sve_fcvt_f32f64:
1383 case Intrinsic::aarch64_sve_fcvt_f64f16:
1384 case Intrinsic::aarch64_sve_fcvt_f64f32:
1385 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1386 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1387 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1388 case Intrinsic::aarch64_sve_fcvtzs:
1389 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1390 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1391 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1392 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1393 case Intrinsic::aarch64_sve_fcvtzu:
1394 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1395 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1396 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1397 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1398 case Intrinsic::aarch64_sve_scvtf:
1399 case Intrinsic::aarch64_sve_scvtf_f16i32:
1400 case Intrinsic::aarch64_sve_scvtf_f16i64:
1401 case Intrinsic::aarch64_sve_scvtf_f32i64:
1402 case Intrinsic::aarch64_sve_scvtf_f64i32:
1403 case Intrinsic::aarch64_sve_ucvtf:
1404 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1405 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1406 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1407 case Intrinsic::aarch64_sve_ucvtf_f64i32:
// NOTE(review): return line 1408 missing here.
1409
1410 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1411 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1412 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1413 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
// NOTE(review): return line 1414 missing here.
1415
// Binary merging ops: each maps to its undef (`_u`) form; some also record
// the equivalent IR opcode so generic instsimplify can be reused.
1416 case Intrinsic::aarch64_sve_fabd:
1417 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1418 case Intrinsic::aarch64_sve_fadd:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1420 .setMatchingIROpcode(Instruction::FAdd);
1421 case Intrinsic::aarch64_sve_fdiv:
1422 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1423 .setMatchingIROpcode(Instruction::FDiv);
1424 case Intrinsic::aarch64_sve_fmax:
1425 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1426 case Intrinsic::aarch64_sve_fmaxnm:
1427 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1428 case Intrinsic::aarch64_sve_fmin:
1429 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1430 case Intrinsic::aarch64_sve_fminnm:
1431 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1432 case Intrinsic::aarch64_sve_fmla:
1433 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1434 case Intrinsic::aarch64_sve_fmls:
1435 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1436 case Intrinsic::aarch64_sve_fmul:
1437 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1438 .setMatchingIROpcode(Instruction::FMul);
1439 case Intrinsic::aarch64_sve_fmulx:
1440 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1441 case Intrinsic::aarch64_sve_fnmla:
1442 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1443 case Intrinsic::aarch64_sve_fnmls:
1444 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1445 case Intrinsic::aarch64_sve_fsub:
1446 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1447 .setMatchingIROpcode(Instruction::FSub);
1448 case Intrinsic::aarch64_sve_add:
1449 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1450 .setMatchingIROpcode(Instruction::Add);
1451 case Intrinsic::aarch64_sve_mla:
1452 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1453 case Intrinsic::aarch64_sve_mls:
1454 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1455 case Intrinsic::aarch64_sve_mul:
1456 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1457 .setMatchingIROpcode(Instruction::Mul);
1458 case Intrinsic::aarch64_sve_sabd:
1459 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1460 case Intrinsic::aarch64_sve_sdiv:
1461 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1462 .setMatchingIROpcode(Instruction::SDiv);
1463 case Intrinsic::aarch64_sve_smax:
1464 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1465 case Intrinsic::aarch64_sve_smin:
1466 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1467 case Intrinsic::aarch64_sve_smulh:
1468 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1469 case Intrinsic::aarch64_sve_sub:
1470 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1471 .setMatchingIROpcode(Instruction::Sub);
1472 case Intrinsic::aarch64_sve_uabd:
1473 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1474 case Intrinsic::aarch64_sve_udiv:
1475 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1476 .setMatchingIROpcode(Instruction::UDiv);
1477 case Intrinsic::aarch64_sve_umax:
1478 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1479 case Intrinsic::aarch64_sve_umin:
1480 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1481 case Intrinsic::aarch64_sve_umulh:
1482 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1483 case Intrinsic::aarch64_sve_asr:
1484 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1485 .setMatchingIROpcode(Instruction::AShr);
1486 case Intrinsic::aarch64_sve_lsl:
1487 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1488 .setMatchingIROpcode(Instruction::Shl);
1489 case Intrinsic::aarch64_sve_lsr:
1490 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1491 .setMatchingIROpcode(Instruction::LShr);
1492 case Intrinsic::aarch64_sve_and:
1493 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1494 .setMatchingIROpcode(Instruction::And);
1495 case Intrinsic::aarch64_sve_bic:
1496 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1497 case Intrinsic::aarch64_sve_eor:
1498 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1499 .setMatchingIROpcode(Instruction::Xor);
1500 case Intrinsic::aarch64_sve_orr:
1501 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1502 .setMatchingIROpcode(Instruction::Or);
1503 case Intrinsic::aarch64_sve_shsub:
1504 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1505 case Intrinsic::aarch64_sve_shsubr:
// NOTE(review): return line 1506 missing here.
1507 case Intrinsic::aarch64_sve_sqrshl:
1508 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1509 case Intrinsic::aarch64_sve_sqshl:
1510 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1511 case Intrinsic::aarch64_sve_sqsub:
1512 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1513 case Intrinsic::aarch64_sve_srshl:
1514 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1515 case Intrinsic::aarch64_sve_uhsub:
1516 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1517 case Intrinsic::aarch64_sve_uhsubr:
// NOTE(review): return line 1518 missing here.
1519 case Intrinsic::aarch64_sve_uqrshl:
1520 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1521 case Intrinsic::aarch64_sve_uqshl:
1522 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1523 case Intrinsic::aarch64_sve_uqsub:
1524 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1525 case Intrinsic::aarch64_sve_urshl:
1526 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1527
// Already-undef (`_u`) ops only need the IR-opcode mapping; their
// defaultUndefOp(...) returns (e.g. line 1529) are partially missing here.
1528 case Intrinsic::aarch64_sve_add_u:
1530 Instruction::Add);
1531 case Intrinsic::aarch64_sve_and_u:
1533 Instruction::And);
1534 case Intrinsic::aarch64_sve_asr_u:
1536 Instruction::AShr);
1537 case Intrinsic::aarch64_sve_eor_u:
1539 Instruction::Xor);
1540 case Intrinsic::aarch64_sve_fadd_u:
1542 Instruction::FAdd);
1543 case Intrinsic::aarch64_sve_fdiv_u:
1545 Instruction::FDiv);
1546 case Intrinsic::aarch64_sve_fmul_u:
1548 Instruction::FMul);
1549 case Intrinsic::aarch64_sve_fsub_u:
1551 Instruction::FSub);
1552 case Intrinsic::aarch64_sve_lsl_u:
1554 Instruction::Shl);
1555 case Intrinsic::aarch64_sve_lsr_u:
1557 Instruction::LShr);
1558 case Intrinsic::aarch64_sve_mul_u:
1560 Instruction::Mul);
1561 case Intrinsic::aarch64_sve_orr_u:
1563 Instruction::Or);
1564 case Intrinsic::aarch64_sve_sdiv_u:
1566 Instruction::SDiv);
1567 case Intrinsic::aarch64_sve_sub_u:
1569 Instruction::Sub);
1570 case Intrinsic::aarch64_sve_udiv_u:
1572 Instruction::UDiv);
1573
// Zeroing/comparison/load-style ops; the shared return (line 1657) is
// missing from this listing.
1574 case Intrinsic::aarch64_sve_addqv:
1575 case Intrinsic::aarch64_sve_and_z:
1576 case Intrinsic::aarch64_sve_bic_z:
1577 case Intrinsic::aarch64_sve_brka_z:
1578 case Intrinsic::aarch64_sve_brkb_z:
1579 case Intrinsic::aarch64_sve_brkn_z:
1580 case Intrinsic::aarch64_sve_brkpa_z:
1581 case Intrinsic::aarch64_sve_brkpb_z:
1582 case Intrinsic::aarch64_sve_cntp:
1583 case Intrinsic::aarch64_sve_compact:
1584 case Intrinsic::aarch64_sve_eor_z:
1585 case Intrinsic::aarch64_sve_eorv:
1586 case Intrinsic::aarch64_sve_eorqv:
1587 case Intrinsic::aarch64_sve_nand_z:
1588 case Intrinsic::aarch64_sve_nor_z:
1589 case Intrinsic::aarch64_sve_orn_z:
1590 case Intrinsic::aarch64_sve_orr_z:
1591 case Intrinsic::aarch64_sve_orv:
1592 case Intrinsic::aarch64_sve_orqv:
1593 case Intrinsic::aarch64_sve_pnext:
1594 case Intrinsic::aarch64_sve_rdffr_z:
1595 case Intrinsic::aarch64_sve_saddv:
1596 case Intrinsic::aarch64_sve_uaddv:
1597 case Intrinsic::aarch64_sve_umaxv:
1598 case Intrinsic::aarch64_sve_umaxqv:
1599 case Intrinsic::aarch64_sve_cmpeq:
1600 case Intrinsic::aarch64_sve_cmpeq_wide:
1601 case Intrinsic::aarch64_sve_cmpge:
1602 case Intrinsic::aarch64_sve_cmpge_wide:
1603 case Intrinsic::aarch64_sve_cmpgt:
1604 case Intrinsic::aarch64_sve_cmpgt_wide:
1605 case Intrinsic::aarch64_sve_cmphi:
1606 case Intrinsic::aarch64_sve_cmphi_wide:
1607 case Intrinsic::aarch64_sve_cmphs:
1608 case Intrinsic::aarch64_sve_cmphs_wide:
1609 case Intrinsic::aarch64_sve_cmple_wide:
1610 case Intrinsic::aarch64_sve_cmplo_wide:
1611 case Intrinsic::aarch64_sve_cmpls_wide:
1612 case Intrinsic::aarch64_sve_cmplt_wide:
1613 case Intrinsic::aarch64_sve_cmpne:
1614 case Intrinsic::aarch64_sve_cmpne_wide:
1615 case Intrinsic::aarch64_sve_facge:
1616 case Intrinsic::aarch64_sve_facgt:
1617 case Intrinsic::aarch64_sve_fcmpeq:
1618 case Intrinsic::aarch64_sve_fcmpge:
1619 case Intrinsic::aarch64_sve_fcmpgt:
1620 case Intrinsic::aarch64_sve_fcmpne:
1621 case Intrinsic::aarch64_sve_fcmpuo:
1622 case Intrinsic::aarch64_sve_ld1:
1623 case Intrinsic::aarch64_sve_ld1_gather:
1624 case Intrinsic::aarch64_sve_ld1_gather_index:
1625 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1626 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1627 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1628 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1629 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1630 case Intrinsic::aarch64_sve_ld1q_gather_index:
1631 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1632 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1633 case Intrinsic::aarch64_sve_ld1ro:
1634 case Intrinsic::aarch64_sve_ld1rq:
1635 case Intrinsic::aarch64_sve_ld1udq:
1636 case Intrinsic::aarch64_sve_ld1uwq:
1637 case Intrinsic::aarch64_sve_ld2_sret:
1638 case Intrinsic::aarch64_sve_ld2q_sret:
1639 case Intrinsic::aarch64_sve_ld3_sret:
1640 case Intrinsic::aarch64_sve_ld3q_sret:
1641 case Intrinsic::aarch64_sve_ld4_sret:
1642 case Intrinsic::aarch64_sve_ld4q_sret:
1643 case Intrinsic::aarch64_sve_ldff1:
1644 case Intrinsic::aarch64_sve_ldff1_gather:
1645 case Intrinsic::aarch64_sve_ldff1_gather_index:
1646 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1647 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1648 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1649 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1650 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1651 case Intrinsic::aarch64_sve_ldnf1:
1652 case Intrinsic::aarch64_sve_ldnt1:
1653 case Intrinsic::aarch64_sve_ldnt1_gather:
1654 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1655 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1656 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1658
// Prefetches/stores: void ops whose governing predicate index varies; their
// defaultVoidOp(...) returns (e.g. lines 1676, 1694, 1697, 1700, 1703) are
// missing from this listing.
1659 case Intrinsic::aarch64_sve_prf:
1660 case Intrinsic::aarch64_sve_prfb_gather_index:
1661 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1662 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1663 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1664 case Intrinsic::aarch64_sve_prfd_gather_index:
1665 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1666 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1667 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1668 case Intrinsic::aarch64_sve_prfh_gather_index:
1669 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1670 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1671 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1672 case Intrinsic::aarch64_sve_prfw_gather_index:
1673 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1674 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1675 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1677
1678 case Intrinsic::aarch64_sve_st1_scatter:
1679 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1680 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1681 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1682 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1683 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1684 case Intrinsic::aarch64_sve_st1dq:
1685 case Intrinsic::aarch64_sve_st1q_scatter_index:
1686 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1687 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1688 case Intrinsic::aarch64_sve_st1wq:
1689 case Intrinsic::aarch64_sve_stnt1:
1690 case Intrinsic::aarch64_sve_stnt1_scatter:
1691 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1692 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1693 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1695 case Intrinsic::aarch64_sve_st2:
1696 case Intrinsic::aarch64_sve_st2q:
1698 case Intrinsic::aarch64_sve_st3:
1699 case Intrinsic::aarch64_sve_st3q:
1701 case Intrinsic::aarch64_sve_st4:
1702 case Intrinsic::aarch64_sve_st4q:
1704 }
1705
// Unrecognised intrinsics yield an empty (falsey) info object.
1706 return SVEIntrinsicInfo();
1707}
1708
// Returns true when \p Pred is provably an all-active SVE predicate: after
// looking through lane-narrowing convert_from_svbool/convert_to_svbool casts,
// the remaining value must be an all-ones constant.
// NOTE(review): the two `match(Pred, m_Intrinsic<...>(...` condition lines
// (doxygen lines 1713 and 1718) are missing from this extracted listing;
// only their continuation lines survive below.
1709static bool isAllActivePredicate(Value *Pred) {
1710 Value *UncastedPred;
1711
1712 // Look through predicate casts that only remove lanes.
1714 m_Value(UncastedPred)))) {
1715 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1716 Pred = UncastedPred;
1717
1719 m_Value(UncastedPred))))
1720 // If the predicate has the same or less lanes than the uncasted predicate
1721 // then we know the casting has no effect.
1722 if (OrigPredTy->getMinNumElements() <=
1723 cast<ScalableVectorType>(UncastedPred->getType())
1724 ->getMinNumElements())
1725 Pred = UncastedPred;
1726 }
1727
// An all-ones constant predicate means every lane is active.
1728 auto *C = dyn_cast<Constant>(Pred);
1729 return C && C->isAllOnesValue();
1730}
1731
1732// Simplify `V` by only considering the operations that affect active lanes.
1733// This function should only return existing Values or newly created Constants.
// If V is an sve.dup of a constant governed by the same predicate Pg, the
// active lanes are all that constant, so it can be replaced by a splat.
// NOTE(review): the start of the return statement (doxygen line 1738,
// upstream `return ConstantVector::getSplat(`) is missing from this listing;
// its argument lines survive below.
1734static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1735 auto *Dup = dyn_cast<IntrinsicInst>(V);
1736 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1737 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1739 cast<VectorType>(V->getType())->getElementCount(),
1740 cast<Constant>(Dup->getOperand(2)));
1741
// Otherwise return the value unchanged.
1742 return V;
1743}
1744
// Try to simplify a predicated SVE binary intrinsic via the generic
// simplifyBinOp machinery, using IInfo's matching IR opcode. Returns the
// replaced/updated instruction, or std::nullopt when no change was made.
// NOTE(review): this listing is missing the first signature line (doxygen
// line 1746, upstream `simplifySVEIntrinsicBinOp(InstCombiner &IC,
// IntrinsicInst &II,`) and the canonicalisation condition line 1757.
1745static std::optional<Instruction *>
1747 const SVEIntrinsicInfo &IInfo) {
1748 const unsigned Opc = IInfo.getMatchingIROpode();
1749 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1750
// Operand layout assumed here: predicate at 0, data operands at 1 and 2.
1751 Value *Pg = II.getOperand(0);
1752 Value *Op1 = II.getOperand(1);
1753 Value *Op2 = II.getOperand(2);
1754 const DataLayout &DL = II.getDataLayout();
1755
1756 // Canonicalise constants to the RHS.
1758 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1759 IC.replaceOperand(II, 1, Op2);
1760 IC.replaceOperand(II, 2, Op1);
1761 return &II;
1762 }
1763
1764 // Only active lanes matter when simplifying the operation.
1765 Op1 = stripInactiveLanes(Op1, Pg);
1766 Op2 = stripInactiveLanes(Op2, Pg);
1767
// FP intrinsics carry fast-math flags that the simplifier may exploit.
1768 Value *SimpleII;
1769 if (auto FII = dyn_cast<FPMathOperator>(&II))
1770 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1771 else
1772 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1773
1774 // An SVE intrinsic's result is always defined. However, this is not the case
1775 // for its equivalent IR instruction (e.g. when shifting by an amount more
1776 // than the data's bitwidth). Simplifications to an undefined result must be
1777 // ignored to preserve the intrinsic's expected behaviour.
1778 if (!SimpleII || isa<UndefValue>(SimpleII))
1779 return std::nullopt;
1780
// Inactive lanes undefined: the simplified value can replace II outright.
1781 if (IInfo.inactiveLanesAreNotDefined())
1782 return IC.replaceInstUsesWith(II, SimpleII);
1783
1784 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1785
1786 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1787 if (SimpleII == Inactive)
1788 return IC.replaceInstUsesWith(II, SimpleII);
1789
1790 // Inactive lanes must be preserved.
1791 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1792 return IC.replaceInstUsesWith(II, SimpleII);
1793}
1794
1795// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1796// to operations with less strict inactive lane requirements.
1797static std::optional<Instruction *>
1799 const SVEIntrinsicInfo &IInfo) {
1800 if (!IInfo.hasGoverningPredicate())
1801 return std::nullopt;
1802
1803 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1804
1805 // If there are no active lanes.
1806 if (match(OpPredicate, m_ZeroInt())) {
1808 return IC.replaceInstUsesWith(
1809 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1810
1811 if (IInfo.inactiveLanesAreUnused()) {
1812 if (IInfo.resultIsZeroInitialized())
1814
1815 return IC.eraseInstFromFunction(II);
1816 }
1817 }
1818
1819 // If there are no inactive lanes.
1820 if (isAllActivePredicate(OpPredicate)) {
1821 if (IInfo.hasOperandWithNoActiveLanes()) {
1822 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1823 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1824 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1825 }
1826
1827 if (IInfo.hasMatchingUndefIntrinsic()) {
1828 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1829 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1830 II.setCalledFunction(NewDecl);
1831 return &II;
1832 }
1833 }
1834
1835 // Operation specific simplifications.
1836 if (IInfo.hasMatchingIROpode() &&
1838 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1839
1840 return std::nullopt;
1841}
1842
1843// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1844// => (binop (pred) (from_svbool _) (from_svbool _))
1845//
1846// The above transformation eliminates a `to_svbool` in the predicate
1847// operand of bitwise operation `binop` by narrowing the vector width of
1848// the operation. For example, it would convert a `<vscale x 16 x i1>
1849// and` into a `<vscale x 4 x i1> and`. This is profitable because
1850// to_svbool must zero the new lanes during widening, whereas
1851// from_svbool is free.
1852static std::optional<Instruction *>
1854 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1855 if (!BinOp)
1856 return std::nullopt;
1857
1858 auto IntrinsicID = BinOp->getIntrinsicID();
1859 switch (IntrinsicID) {
1860 case Intrinsic::aarch64_sve_and_z:
1861 case Intrinsic::aarch64_sve_bic_z:
1862 case Intrinsic::aarch64_sve_eor_z:
1863 case Intrinsic::aarch64_sve_nand_z:
1864 case Intrinsic::aarch64_sve_nor_z:
1865 case Intrinsic::aarch64_sve_orn_z:
1866 case Intrinsic::aarch64_sve_orr_z:
1867 break;
1868 default:
1869 return std::nullopt;
1870 }
1871
1872 auto BinOpPred = BinOp->getOperand(0);
1873 auto BinOpOp1 = BinOp->getOperand(1);
1874 auto BinOpOp2 = BinOp->getOperand(2);
1875
1876 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1877 if (!PredIntr ||
1878 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1879 return std::nullopt;
1880
1881 auto PredOp = PredIntr->getOperand(0);
1882 auto PredOpTy = cast<VectorType>(PredOp->getType());
1883 if (PredOpTy != II.getType())
1884 return std::nullopt;
1885
1886 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1887 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1888 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1889 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1890 if (BinOpOp1 == BinOpOp2)
1891 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1892 else
1893 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1894 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1895
1896 auto NarrowedBinOp =
1897 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1898 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1899}
1900
1901static std::optional<Instruction *>
1903 // If the reinterpret instruction operand is a PHI Node
1904 if (isa<PHINode>(II.getArgOperand(0)))
1905 return processPhiNode(IC, II);
1906
1907 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1908 return BinOpCombine;
1909
1910 // Ignore converts to/from svcount_t.
1911 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1912 isa<TargetExtType>(II.getType()))
1913 return std::nullopt;
1914
1915 SmallVector<Instruction *, 32> CandidatesForRemoval;
1916 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1917
1918 const auto *IVTy = cast<VectorType>(II.getType());
1919
1920 // Walk the chain of conversions.
1921 while (Cursor) {
1922 // If the type of the cursor has fewer lanes than the final result, zeroing
1923 // must take place, which breaks the equivalence chain.
1924 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1925 if (CursorVTy->getElementCount().getKnownMinValue() <
1926 IVTy->getElementCount().getKnownMinValue())
1927 break;
1928
1929 // If the cursor has the same type as I, it is a viable replacement.
1930 if (Cursor->getType() == IVTy)
1931 EarliestReplacement = Cursor;
1932
1933 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1934
1935 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1936 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1937 Intrinsic::aarch64_sve_convert_to_svbool ||
1938 IntrinsicCursor->getIntrinsicID() ==
1939 Intrinsic::aarch64_sve_convert_from_svbool))
1940 break;
1941
1942 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1943 Cursor = IntrinsicCursor->getOperand(0);
1944 }
1945
1946 // If no viable replacement in the conversion chain was found, there is
1947 // nothing to do.
1948 if (!EarliestReplacement)
1949 return std::nullopt;
1950
1951 return IC.replaceInstUsesWith(II, EarliestReplacement);
1952}
1953
1954static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1955 IntrinsicInst &II) {
1956 // svsel(ptrue, x, y) => x
1957 auto *OpPredicate = II.getOperand(0);
1958 if (isAllActivePredicate(OpPredicate))
1959 return IC.replaceInstUsesWith(II, II.getOperand(1));
1960
1961 auto Select =
1962 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1963 return IC.replaceInstUsesWith(II, Select);
1964}
1965
1966static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1967 IntrinsicInst &II) {
1968 Value *Pg = II.getOperand(1);
1969
1970 // sve.dup(V, all_active, X) ==> splat(X)
1971 if (isAllActivePredicate(Pg)) {
1972 auto *RetTy = cast<ScalableVectorType>(II.getType());
1973 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1974 II.getArgOperand(2));
1975 return IC.replaceInstUsesWith(II, Splat);
1976 }
1977
1979 m_SpecificInt(AArch64SVEPredPattern::vl1))))
1980 return std::nullopt;
1981
1982 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1983 Value *Insert = IC.Builder.CreateInsertElement(
1984 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
1985 return IC.replaceInstUsesWith(II, Insert);
1986}
1987
1988static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1989 IntrinsicInst &II) {
1990 // Replace DupX with a regular IR splat.
1991 auto *RetTy = cast<ScalableVectorType>(II.getType());
1992 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1993 II.getArgOperand(0));
1994 Splat->takeName(&II);
1995 return IC.replaceInstUsesWith(II, Splat);
1996}
1997
1998static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1999 IntrinsicInst &II) {
2000 LLVMContext &Ctx = II.getContext();
2001
2002 if (!isAllActivePredicate(II.getArgOperand(0)))
2003 return std::nullopt;
2004
2005 // Check that we have a compare of zero..
2006 auto *SplatValue =
2008 if (!SplatValue || !SplatValue->isZero())
2009 return std::nullopt;
2010
2011 // ..against a dupq
2012 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2013 if (!DupQLane ||
2014 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2015 return std::nullopt;
2016
2017 // Where the dupq is a lane 0 replicate of a vector insert
2018 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2019 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2020 return std::nullopt;
2021
2022 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2023 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2024 return std::nullopt;
2025
2026 // Where the vector insert is a fixed constant vector insert into undef at
2027 // index zero
2028 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2029 return std::nullopt;
2030
2031 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2032 return std::nullopt;
2033
2034 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2035 if (!ConstVec)
2036 return std::nullopt;
2037
2038 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2039 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2040 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2041 return std::nullopt;
2042
2043 unsigned NumElts = VecTy->getNumElements();
2044 unsigned PredicateBits = 0;
2045
2046 // Expand intrinsic operands to a 16-bit byte level predicate
2047 for (unsigned I = 0; I < NumElts; ++I) {
2048 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2049 if (!Arg)
2050 return std::nullopt;
2051 if (!Arg->isZero())
2052 PredicateBits |= 1 << (I * (16 / NumElts));
2053 }
2054
2055 // If all bits are zero bail early with an empty predicate
2056 if (PredicateBits == 0) {
2057 auto *PFalse = Constant::getNullValue(II.getType());
2058 PFalse->takeName(&II);
2059 return IC.replaceInstUsesWith(II, PFalse);
2060 }
2061
2062 // Calculate largest predicate type used (where byte predicate is largest)
2063 unsigned Mask = 8;
2064 for (unsigned I = 0; I < 16; ++I)
2065 if ((PredicateBits & (1 << I)) != 0)
2066 Mask |= (I % 8);
2067
2068 unsigned PredSize = Mask & -Mask;
2069 auto *PredType = ScalableVectorType::get(
2070 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2071
2072 // Ensure all relevant bits are set
2073 for (unsigned I = 0; I < 16; I += PredSize)
2074 if ((PredicateBits & (1 << I)) == 0)
2075 return std::nullopt;
2076
2077 auto *PTruePat =
2078 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2079 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2080 {PredType}, {PTruePat});
2081 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2082 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2083 auto *ConvertFromSVBool =
2084 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2085 {II.getType()}, {ConvertToSVBool});
2086
2087 ConvertFromSVBool->takeName(&II);
2088 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2089}
2090
2091static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2092 IntrinsicInst &II) {
2093 Value *Pg = II.getArgOperand(0);
2094 Value *Vec = II.getArgOperand(1);
2095 auto IntrinsicID = II.getIntrinsicID();
2096 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2097
2098 // lastX(splat(X)) --> X
2099 if (auto *SplatVal = getSplatValue(Vec))
2100 return IC.replaceInstUsesWith(II, SplatVal);
2101
2102 // If x and/or y is a splat value then:
2103 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2104 Value *LHS, *RHS;
2105 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2106 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2107 auto *OldBinOp = cast<BinaryOperator>(Vec);
2108 auto OpC = OldBinOp->getOpcode();
2109 auto *NewLHS =
2110 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2111 auto *NewRHS =
2112 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2114 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2115 return IC.replaceInstUsesWith(II, NewBinOp);
2116 }
2117 }
2118
2119 auto *C = dyn_cast<Constant>(Pg);
2120 if (IsAfter && C && C->isNullValue()) {
2121 // The intrinsic is extracting lane 0 so use an extract instead.
2122 auto *IdxTy = Type::getInt64Ty(II.getContext());
2123 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2124 Extract->insertBefore(II.getIterator());
2125 Extract->takeName(&II);
2126 return IC.replaceInstUsesWith(II, Extract);
2127 }
2128
2129 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2130 if (!IntrPG)
2131 return std::nullopt;
2132
2133 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2134 return std::nullopt;
2135
2136 const auto PTruePattern =
2137 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2138
2139 // Can the intrinsic's predicate be converted to a known constant index?
2140 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2141 if (!MinNumElts)
2142 return std::nullopt;
2143
2144 unsigned Idx = MinNumElts - 1;
2145 // Increment the index if extracting the element after the last active
2146 // predicate element.
2147 if (IsAfter)
2148 ++Idx;
2149
2150 // Ignore extracts whose index is larger than the known minimum vector
2151 // length. NOTE: This is an artificial constraint where we prefer to
2152 // maintain what the user asked for until an alternative is proven faster.
2153 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2154 if (Idx >= PgVTy->getMinNumElements())
2155 return std::nullopt;
2156
2157 // The intrinsic is extracting a fixed lane so use an extract instead.
2158 auto *IdxTy = Type::getInt64Ty(II.getContext());
2159 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2160 Extract->insertBefore(II.getIterator());
2161 Extract->takeName(&II);
2162 return IC.replaceInstUsesWith(II, Extract);
2163}
2164
2165static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2166 IntrinsicInst &II) {
2167 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2168 // integer variant across a variety of micro-architectures. Replace scalar
2169 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2170 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2171 // depending on the micro-architecture, but has been observed as generally
2172 // being faster, particularly when the CLAST[AB] op is a loop-carried
2173 // dependency.
2174 Value *Pg = II.getArgOperand(0);
2175 Value *Fallback = II.getArgOperand(1);
2176 Value *Vec = II.getArgOperand(2);
2177 Type *Ty = II.getType();
2178
2179 if (!Ty->isIntegerTy())
2180 return std::nullopt;
2181
2182 Type *FPTy;
2183 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2184 default:
2185 return std::nullopt;
2186 case 16:
2187 FPTy = IC.Builder.getHalfTy();
2188 break;
2189 case 32:
2190 FPTy = IC.Builder.getFloatTy();
2191 break;
2192 case 64:
2193 FPTy = IC.Builder.getDoubleTy();
2194 break;
2195 }
2196
2197 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2198 auto *FPVTy = VectorType::get(
2199 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2200 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2201 auto *FPII = IC.Builder.CreateIntrinsic(
2202 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2203 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2204 return IC.replaceInstUsesWith(II, FPIItoInt);
2205}
2206
2207static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2208 IntrinsicInst &II) {
2209 LLVMContext &Ctx = II.getContext();
2210 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2211 // can work with RDFFR_PP for ptest elimination.
2212 auto *AllPat =
2213 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2214 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2215 {II.getType()}, {AllPat});
2216 auto *RDFFR =
2217 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2218 RDFFR->takeName(&II);
2219 return IC.replaceInstUsesWith(II, RDFFR);
2220}
2221
2222static std::optional<Instruction *>
2224 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2225
2226 if (Pattern == AArch64SVEPredPattern::all) {
2228 II.getType(), ElementCount::getScalable(NumElts));
2229 Cnt->takeName(&II);
2230 return IC.replaceInstUsesWith(II, Cnt);
2231 }
2232
2233 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2234
2235 return MinNumElts && NumElts >= MinNumElts
2236 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2237 II, ConstantInt::get(II.getType(), MinNumElts)))
2238 : std::nullopt;
2239}
2240
2241static std::optional<Instruction *>
2243 const AArch64Subtarget *ST) {
2244 if (!ST->isStreaming())
2245 return std::nullopt;
2246
2247 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2248 // with SVEPredPattern::all
2249 Value *Cnt =
2251 Cnt->takeName(&II);
2252 return IC.replaceInstUsesWith(II, Cnt);
2253}
2254
2255static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2256 IntrinsicInst &II) {
2257 Value *PgVal = II.getArgOperand(0);
2258 Value *OpVal = II.getArgOperand(1);
2259
2260 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2261 // Later optimizations prefer this form.
2262 if (PgVal == OpVal &&
2263 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2264 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2265 Value *Ops[] = {PgVal, OpVal};
2266 Type *Tys[] = {PgVal->getType()};
2267
2268 auto *PTest =
2269 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2270 PTest->takeName(&II);
2271
2272 return IC.replaceInstUsesWith(II, PTest);
2273 }
2274
2277
2278 if (!Pg || !Op)
2279 return std::nullopt;
2280
2281 Intrinsic::ID OpIID = Op->getIntrinsicID();
2282
2283 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2284 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2285 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2286 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2287 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2288
2289 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2290
2291 PTest->takeName(&II);
2292 return IC.replaceInstUsesWith(II, PTest);
2293 }
2294
2295 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2296 // Later optimizations may rewrite sequence to use the flag-setting variant
2297 // of instruction X to remove PTEST.
2298 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2299 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2300 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2301 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2302 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2303 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2304 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2305 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2306 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2307 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2308 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2309 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2310 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2311 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2312 Type *Tys[] = {Pg->getType()};
2313
2314 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2315 PTest->takeName(&II);
2316
2317 return IC.replaceInstUsesWith(II, PTest);
2318 }
2319
2320 return std::nullopt;
2321}
2322
2323template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2324static std::optional<Instruction *>
2326 bool MergeIntoAddendOp) {
2327 Value *P = II.getOperand(0);
2328 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2329 if (MergeIntoAddendOp) {
2330 AddendOp = II.getOperand(1);
2331 Mul = II.getOperand(2);
2332 } else {
2333 AddendOp = II.getOperand(2);
2334 Mul = II.getOperand(1);
2335 }
2336
2338 m_Value(MulOp1))))
2339 return std::nullopt;
2340
2341 if (!Mul->hasOneUse())
2342 return std::nullopt;
2343
2344 Instruction *FMFSource = nullptr;
2345 if (II.getType()->isFPOrFPVectorTy()) {
2346 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2347 // Stop the combine when the flags on the inputs differ in case dropping
2348 // flags would lead to us missing out on more beneficial optimizations.
2349 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2350 return std::nullopt;
2351 if (!FAddFlags.allowContract())
2352 return std::nullopt;
2353 FMFSource = &II;
2354 }
2355
2356 CallInst *Res;
2357 if (MergeIntoAddendOp)
2358 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2359 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2360 else
2361 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2362 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2363
2364 return IC.replaceInstUsesWith(II, Res);
2365}
2366
2367static std::optional<Instruction *>
2369 Value *Pred = II.getOperand(0);
2370 Value *PtrOp = II.getOperand(1);
2371 Type *VecTy = II.getType();
2372
2373 if (isAllActivePredicate(Pred)) {
2374 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2375 Load->copyMetadata(II);
2376 return IC.replaceInstUsesWith(II, Load);
2377 }
2378
2379 CallInst *MaskedLoad =
2380 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2381 Pred, ConstantAggregateZero::get(VecTy));
2382 MaskedLoad->copyMetadata(II);
2383 return IC.replaceInstUsesWith(II, MaskedLoad);
2384}
2385
2386static std::optional<Instruction *>
2388 Value *VecOp = II.getOperand(0);
2389 Value *Pred = II.getOperand(1);
2390 Value *PtrOp = II.getOperand(2);
2391
2392 if (isAllActivePredicate(Pred)) {
2393 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2394 Store->copyMetadata(II);
2395 return IC.eraseInstFromFunction(II);
2396 }
2397
2398 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2399 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2400 MaskedStore->copyMetadata(II);
2401 return IC.eraseInstFromFunction(II);
2402}
2403
2405 switch (Intrinsic) {
2406 case Intrinsic::aarch64_sve_fmul_u:
2407 return Instruction::BinaryOps::FMul;
2408 case Intrinsic::aarch64_sve_fadd_u:
2409 return Instruction::BinaryOps::FAdd;
2410 case Intrinsic::aarch64_sve_fsub_u:
2411 return Instruction::BinaryOps::FSub;
2412 default:
2413 return Instruction::BinaryOpsEnd;
2414 }
2415}
2416
2417static std::optional<Instruction *>
2419 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2420 if (II.isStrictFP())
2421 return std::nullopt;
2422
2423 auto *OpPredicate = II.getOperand(0);
2424 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2425 if (BinOpCode == Instruction::BinaryOpsEnd ||
2426 !isAllActivePredicate(OpPredicate))
2427 return std::nullopt;
2428 auto BinOp = IC.Builder.CreateBinOpFMF(
2429 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2430 return IC.replaceInstUsesWith(II, BinOp);
2431}
2432
2433static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2434 IntrinsicInst &II) {
2435 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2436 Intrinsic::aarch64_sve_mla>(
2437 IC, II, true))
2438 return MLA;
2439 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2440 Intrinsic::aarch64_sve_mad>(
2441 IC, II, false))
2442 return MAD;
2443 return std::nullopt;
2444}
2445
2446static std::optional<Instruction *>
2448 if (auto FMLA =
2449 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2450 Intrinsic::aarch64_sve_fmla>(IC, II,
2451 true))
2452 return FMLA;
2453 if (auto FMAD =
2454 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2455 Intrinsic::aarch64_sve_fmad>(IC, II,
2456 false))
2457 return FMAD;
2458 if (auto FMLA =
2459 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2460 Intrinsic::aarch64_sve_fmla>(IC, II,
2461 true))
2462 return FMLA;
2463 return std::nullopt;
2464}
2465
2466static std::optional<Instruction *>
2468 if (auto FMLA =
2469 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2470 Intrinsic::aarch64_sve_fmla>(IC, II,
2471 true))
2472 return FMLA;
2473 if (auto FMAD =
2474 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2475 Intrinsic::aarch64_sve_fmad>(IC, II,
2476 false))
2477 return FMAD;
2478 if (auto FMLA_U =
2479 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2480 Intrinsic::aarch64_sve_fmla_u>(
2481 IC, II, true))
2482 return FMLA_U;
2483 return instCombineSVEVectorBinOp(IC, II);
2484}
2485
2486static std::optional<Instruction *>
2488 if (auto FMLS =
2489 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2490 Intrinsic::aarch64_sve_fmls>(IC, II,
2491 true))
2492 return FMLS;
2493 if (auto FMSB =
2494 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2495 Intrinsic::aarch64_sve_fnmsb>(
2496 IC, II, false))
2497 return FMSB;
2498 if (auto FMLS =
2499 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2500 Intrinsic::aarch64_sve_fmls>(IC, II,
2501 true))
2502 return FMLS;
2503 return std::nullopt;
2504}
2505
2506static std::optional<Instruction *>
2508 if (auto FMLS =
2509 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2510 Intrinsic::aarch64_sve_fmls>(IC, II,
2511 true))
2512 return FMLS;
2513 if (auto FMSB =
2514 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2515 Intrinsic::aarch64_sve_fnmsb>(
2516 IC, II, false))
2517 return FMSB;
2518 if (auto FMLS_U =
2519 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2520 Intrinsic::aarch64_sve_fmls_u>(
2521 IC, II, true))
2522 return FMLS_U;
2523 return instCombineSVEVectorBinOp(IC, II);
2524}
2525
2526static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2527 IntrinsicInst &II) {
2528 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2529 Intrinsic::aarch64_sve_mls>(
2530 IC, II, true))
2531 return MLS;
2532 return std::nullopt;
2533}
2534
2535static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2536 IntrinsicInst &II) {
2537 Value *UnpackArg = II.getArgOperand(0);
2538 auto *RetTy = cast<ScalableVectorType>(II.getType());
2539 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2540 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2541
2542 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2543 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2544 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2545 ScalarArg =
2546 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2547 Value *NewVal =
2548 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2549 NewVal->takeName(&II);
2550 return IC.replaceInstUsesWith(II, NewVal);
2551 }
2552
2553 return std::nullopt;
2554}
2555static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2556 IntrinsicInst &II) {
2557 auto *OpVal = II.getOperand(0);
2558 auto *OpIndices = II.getOperand(1);
2559 VectorType *VTy = cast<VectorType>(II.getType());
2560
2561 // Check whether OpIndices is a constant splat value < minimal element count
2562 // of result.
2563 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2564 if (!SplatValue ||
2565 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2566 return std::nullopt;
2567
2568 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2569 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2570 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2571 auto *VectorSplat =
2572 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2573
2574 VectorSplat->takeName(&II);
2575 return IC.replaceInstUsesWith(II, VectorSplat);
2576}
2577
2578static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2579 IntrinsicInst &II) {
2580 Value *A, *B;
2581 Type *RetTy = II.getType();
2582 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2583 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2584
2585 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2586 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2587 if ((match(II.getArgOperand(0),
2589 match(II.getArgOperand(1),
2591 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2592 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2593 auto *TyA = cast<ScalableVectorType>(A->getType());
2594 if (TyA == B->getType() &&
2596 auto *SubVec = IC.Builder.CreateInsertVector(
2597 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2598 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2599 TyA->getMinNumElements());
2600 ConcatVec->takeName(&II);
2601 return IC.replaceInstUsesWith(II, ConcatVec);
2602 }
2603 }
2604
2605 return std::nullopt;
2606}
2607
2608static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2609 IntrinsicInst &II) {
2610 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2611 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2612 Value *A, *B;
2613 if (match(II.getArgOperand(0),
2616 m_Specific(A), m_Specific(B))))
2617 return IC.replaceInstUsesWith(
2618 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2619
2620 return std::nullopt;
2621}
2622
2623static std::optional<Instruction *>
2625 Value *Mask = II.getOperand(0);
2626 Value *BasePtr = II.getOperand(1);
2627 Value *Index = II.getOperand(2);
2628 Type *Ty = II.getType();
2629 Value *PassThru = ConstantAggregateZero::get(Ty);
2630
2631 // Contiguous gather => masked load.
2632 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2633 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2634 Value *IndexBase;
2636 m_Value(IndexBase), m_SpecificInt(1)))) {
2637 Align Alignment =
2638 BasePtr->getPointerAlignment(II.getDataLayout());
2639
2640 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2641 BasePtr, IndexBase);
2642 CallInst *MaskedLoad =
2643 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2644 MaskedLoad->takeName(&II);
2645 return IC.replaceInstUsesWith(II, MaskedLoad);
2646 }
2647
2648 return std::nullopt;
2649}
2650
2651static std::optional<Instruction *>
2653 Value *Val = II.getOperand(0);
2654 Value *Mask = II.getOperand(1);
2655 Value *BasePtr = II.getOperand(2);
2656 Value *Index = II.getOperand(3);
2657 Type *Ty = Val->getType();
2658
2659 // Contiguous scatter => masked store.
2660 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2661 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2662 Value *IndexBase;
2664 m_Value(IndexBase), m_SpecificInt(1)))) {
2665 Align Alignment =
2666 BasePtr->getPointerAlignment(II.getDataLayout());
2667
2668 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2669 BasePtr, IndexBase);
2670 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2671
2672 return IC.eraseInstFromFunction(II);
2673 }
2674
2675 return std::nullopt;
2676}
2677
2678static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2679 IntrinsicInst &II) {
2681 Value *Pred = II.getOperand(0);
2682 Value *Vec = II.getOperand(1);
2683 Value *DivVec = II.getOperand(2);
2684
2685 Value *SplatValue = getSplatValue(DivVec);
2686 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2687 if (!SplatConstantInt)
2688 return std::nullopt;
2689
2690 APInt Divisor = SplatConstantInt->getValue();
2691 const int64_t DivisorValue = Divisor.getSExtValue();
2692 if (DivisorValue == -1)
2693 return std::nullopt;
2694 if (DivisorValue == 1)
2695 IC.replaceInstUsesWith(II, Vec);
2696
2697 if (Divisor.isPowerOf2()) {
2698 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2699 auto ASRD = IC.Builder.CreateIntrinsic(
2700 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2701 return IC.replaceInstUsesWith(II, ASRD);
2702 }
2703 if (Divisor.isNegatedPowerOf2()) {
2704 Divisor.negate();
2705 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2706 auto ASRD = IC.Builder.CreateIntrinsic(
2707 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2708 auto NEG = IC.Builder.CreateIntrinsic(
2709 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2710 return IC.replaceInstUsesWith(II, NEG);
2711 }
2712
2713 return std::nullopt;
2714}
2715
2716bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2717 size_t VecSize = Vec.size();
2718 if (VecSize == 1)
2719 return true;
2720 if (!isPowerOf2_64(VecSize))
2721 return false;
2722 size_t HalfVecSize = VecSize / 2;
2723
2724 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2725 RHS != Vec.end(); LHS++, RHS++) {
2726 if (*LHS != nullptr && *RHS != nullptr) {
2727 if (*LHS == *RHS)
2728 continue;
2729 else
2730 return false;
2731 }
2732 if (!AllowPoison)
2733 return false;
2734 if (*LHS == nullptr && *RHS != nullptr)
2735 *LHS = *RHS;
2736 }
2737
2738 Vec.resize(HalfVecSize);
2739 SimplifyValuePattern(Vec, AllowPoison);
2740 return true;
2741}
2742
2743// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2744// to dupqlane(f64(C)) where C is A concatenated with B
2745static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2746 IntrinsicInst &II) {
2747 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2748 if (!match(II.getOperand(0),
2750 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2751 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2752 return std::nullopt;
2753 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2754
2755 // Insert the scalars into a container ordered by InsertElement index
2756 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2757 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2758 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2759 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2760 CurrentInsertElt = InsertElt->getOperand(0);
2761 }
2762
2763 bool AllowPoison =
2764 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2765 if (!SimplifyValuePattern(Elts, AllowPoison))
2766 return std::nullopt;
2767
2768 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2769 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2770 for (size_t I = 0; I < Elts.size(); I++) {
2771 if (Elts[I] == nullptr)
2772 continue;
2773 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2774 IC.Builder.getInt64(I));
2775 }
2776 if (InsertEltChain == nullptr)
2777 return std::nullopt;
2778
2779 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2780 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2781 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2782 // be narrowed back to the original type.
2783 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2784 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2785 IIScalableTy->getMinNumElements() /
2786 PatternWidth;
2787
2788 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2789 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2790 auto *WideShuffleMaskTy =
2791 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2792
2793 auto InsertSubvector = IC.Builder.CreateInsertVector(
2794 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2795 uint64_t(0));
2796 auto WideBitcast =
2797 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2798 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2799 auto WideShuffle = IC.Builder.CreateShuffleVector(
2800 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2801 auto NarrowBitcast =
2802 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2803
2804 return IC.replaceInstUsesWith(II, NarrowBitcast);
2805}
2806
2807static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2808 IntrinsicInst &II) {
2809 Value *A = II.getArgOperand(0);
2810 Value *B = II.getArgOperand(1);
2811 if (A == B)
2812 return IC.replaceInstUsesWith(II, A);
2813
2814 return std::nullopt;
2815}
2816
2817static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2818 IntrinsicInst &II) {
2819 Value *Pred = II.getOperand(0);
2820 Value *Vec = II.getOperand(1);
2821 Value *Shift = II.getOperand(2);
2822
2823 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2824 Value *AbsPred, *MergedValue;
2826 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2828 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2829
2830 return std::nullopt;
2831
2832 // Transform is valid if any of the following are true:
2833 // * The ABS merge value is an undef or non-negative
2834 // * The ABS predicate is all active
2835 // * The ABS predicate and the SRSHL predicates are the same
2836 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2837 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2838 return std::nullopt;
2839
2840 // Only valid when the shift amount is non-negative, otherwise the rounding
2841 // behaviour of SRSHL cannot be ignored.
2842 if (!match(Shift, m_NonNegative()))
2843 return std::nullopt;
2844
2845 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2846 {II.getType()}, {Pred, Vec, Shift});
2847
2848 return IC.replaceInstUsesWith(II, LSL);
2849}
2850
2851static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2852 IntrinsicInst &II) {
2853 Value *Vec = II.getOperand(0);
2854
2855 if (getSplatValue(Vec) == II.getOperand(1))
2856 return IC.replaceInstUsesWith(II, Vec);
2857
2858 return std::nullopt;
2859}
2860
2861static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2862 IntrinsicInst &II) {
2863 // If this barrier is post-dominated by identical one we can remove it
2864 auto *NI = II.getNextNode();
2865 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2866 auto CanSkipOver = [](Instruction *I) {
2867 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2868 };
2869 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2870 auto *NIBB = NI->getParent();
2871 NI = NI->getNextNode();
2872 if (!NI) {
2873 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2874 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2875 else
2876 break;
2877 }
2878 }
2879 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2880 if (NextII && II.isIdenticalTo(NextII))
2881 return IC.eraseInstFromFunction(II);
2882
2883 return std::nullopt;
2884}
2885
2886static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2887 IntrinsicInst &II) {
2888 return IC.replaceInstUsesWith(
2889 II,
2890 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2891 {II.getType(), II.getOperand(0)->getType()},
2892 {II.getOperand(0), II.getOperand(1)}));
2893}
2894
2895static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2896 IntrinsicInst &II) {
2898 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2899 return std::nullopt;
2900}
2901
2902static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2904 unsigned NumBits) {
2905 Value *Passthru = II.getOperand(0);
2906 Value *Pg = II.getOperand(1);
2907 Value *Op = II.getOperand(2);
2908
2909 // Convert UXT[BHW] to AND.
2910 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2911 auto *Ty = cast<VectorType>(II.getType());
2912 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2913 auto *Mask = ConstantInt::get(Ty, MaskValue);
2914 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2915 {Pg, Op, Mask});
2916 return IC.replaceInstUsesWith(II, And);
2917 }
2918
2919 return std::nullopt;
2920}
2921
2922static std::optional<Instruction *>
2924 SMEAttrs FnSMEAttrs(*II.getFunction());
2925 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2926 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2927 return IC.replaceInstUsesWith(
2928 II, ConstantInt::getBool(II.getType(), IsStreaming));
2929 return std::nullopt;
2930}
2931
2932std::optional<Instruction *>
2934 IntrinsicInst &II) const {
2936 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2937 return I;
2938
2939 Intrinsic::ID IID = II.getIntrinsicID();
2940 switch (IID) {
2941 default:
2942 break;
2943 case Intrinsic::aarch64_dmb:
2944 return instCombineDMB(IC, II);
2945 case Intrinsic::aarch64_neon_fmaxnm:
2946 case Intrinsic::aarch64_neon_fminnm:
2947 return instCombineMaxMinNM(IC, II);
2948 case Intrinsic::aarch64_sve_convert_from_svbool:
2949 return instCombineConvertFromSVBool(IC, II);
2950 case Intrinsic::aarch64_sve_dup:
2951 return instCombineSVEDup(IC, II);
2952 case Intrinsic::aarch64_sve_dup_x:
2953 return instCombineSVEDupX(IC, II);
2954 case Intrinsic::aarch64_sve_cmpne:
2955 case Intrinsic::aarch64_sve_cmpne_wide:
2956 return instCombineSVECmpNE(IC, II);
2957 case Intrinsic::aarch64_sve_rdffr:
2958 return instCombineRDFFR(IC, II);
2959 case Intrinsic::aarch64_sve_lasta:
2960 case Intrinsic::aarch64_sve_lastb:
2961 return instCombineSVELast(IC, II);
2962 case Intrinsic::aarch64_sve_clasta_n:
2963 case Intrinsic::aarch64_sve_clastb_n:
2964 return instCombineSVECondLast(IC, II);
2965 case Intrinsic::aarch64_sve_cntd:
2966 return instCombineSVECntElts(IC, II, 2);
2967 case Intrinsic::aarch64_sve_cntw:
2968 return instCombineSVECntElts(IC, II, 4);
2969 case Intrinsic::aarch64_sve_cnth:
2970 return instCombineSVECntElts(IC, II, 8);
2971 case Intrinsic::aarch64_sve_cntb:
2972 return instCombineSVECntElts(IC, II, 16);
2973 case Intrinsic::aarch64_sme_cntsd:
2974 return instCombineSMECntsd(IC, II, ST);
2975 case Intrinsic::aarch64_sve_ptest_any:
2976 case Intrinsic::aarch64_sve_ptest_first:
2977 case Intrinsic::aarch64_sve_ptest_last:
2978 return instCombineSVEPTest(IC, II);
2979 case Intrinsic::aarch64_sve_fadd:
2980 return instCombineSVEVectorFAdd(IC, II);
2981 case Intrinsic::aarch64_sve_fadd_u:
2982 return instCombineSVEVectorFAddU(IC, II);
2983 case Intrinsic::aarch64_sve_fmul_u:
2984 return instCombineSVEVectorBinOp(IC, II);
2985 case Intrinsic::aarch64_sve_fsub:
2986 return instCombineSVEVectorFSub(IC, II);
2987 case Intrinsic::aarch64_sve_fsub_u:
2988 return instCombineSVEVectorFSubU(IC, II);
2989 case Intrinsic::aarch64_sve_add:
2990 return instCombineSVEVectorAdd(IC, II);
2991 case Intrinsic::aarch64_sve_add_u:
2992 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2993 Intrinsic::aarch64_sve_mla_u>(
2994 IC, II, true);
2995 case Intrinsic::aarch64_sve_sub:
2996 return instCombineSVEVectorSub(IC, II);
2997 case Intrinsic::aarch64_sve_sub_u:
2998 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2999 Intrinsic::aarch64_sve_mls_u>(
3000 IC, II, true);
3001 case Intrinsic::aarch64_sve_tbl:
3002 return instCombineSVETBL(IC, II);
3003 case Intrinsic::aarch64_sve_uunpkhi:
3004 case Intrinsic::aarch64_sve_uunpklo:
3005 case Intrinsic::aarch64_sve_sunpkhi:
3006 case Intrinsic::aarch64_sve_sunpklo:
3007 return instCombineSVEUnpack(IC, II);
3008 case Intrinsic::aarch64_sve_uzp1:
3009 return instCombineSVEUzp1(IC, II);
3010 case Intrinsic::aarch64_sve_zip1:
3011 case Intrinsic::aarch64_sve_zip2:
3012 return instCombineSVEZip(IC, II);
3013 case Intrinsic::aarch64_sve_ld1_gather_index:
3014 return instCombineLD1GatherIndex(IC, II);
3015 case Intrinsic::aarch64_sve_st1_scatter_index:
3016 return instCombineST1ScatterIndex(IC, II);
3017 case Intrinsic::aarch64_sve_ld1:
3018 return instCombineSVELD1(IC, II, DL);
3019 case Intrinsic::aarch64_sve_st1:
3020 return instCombineSVEST1(IC, II, DL);
3021 case Intrinsic::aarch64_sve_sdiv:
3022 return instCombineSVESDIV(IC, II);
3023 case Intrinsic::aarch64_sve_sel:
3024 return instCombineSVESel(IC, II);
3025 case Intrinsic::aarch64_sve_srshl:
3026 return instCombineSVESrshl(IC, II);
3027 case Intrinsic::aarch64_sve_dupq_lane:
3028 return instCombineSVEDupqLane(IC, II);
3029 case Intrinsic::aarch64_sve_insr:
3030 return instCombineSVEInsr(IC, II);
3031 case Intrinsic::aarch64_sve_whilelo:
3032 return instCombineWhilelo(IC, II);
3033 case Intrinsic::aarch64_sve_ptrue:
3034 return instCombinePTrue(IC, II);
3035 case Intrinsic::aarch64_sve_uxtb:
3036 return instCombineSVEUxt(IC, II, 8);
3037 case Intrinsic::aarch64_sve_uxth:
3038 return instCombineSVEUxt(IC, II, 16);
3039 case Intrinsic::aarch64_sve_uxtw:
3040 return instCombineSVEUxt(IC, II, 32);
3041 case Intrinsic::aarch64_sme_in_streaming_mode:
3042 return instCombineInStreamingMode(IC, II);
3043 }
3044
3045 return std::nullopt;
3046}
3047
3049 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3050 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3051 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3052 SimplifyAndSetOp) const {
3053 switch (II.getIntrinsicID()) {
3054 default:
3055 break;
3056 case Intrinsic::aarch64_neon_fcvtxn:
3057 case Intrinsic::aarch64_neon_rshrn:
3058 case Intrinsic::aarch64_neon_sqrshrn:
3059 case Intrinsic::aarch64_neon_sqrshrun:
3060 case Intrinsic::aarch64_neon_sqshrn:
3061 case Intrinsic::aarch64_neon_sqshrun:
3062 case Intrinsic::aarch64_neon_sqxtn:
3063 case Intrinsic::aarch64_neon_sqxtun:
3064 case Intrinsic::aarch64_neon_uqrshrn:
3065 case Intrinsic::aarch64_neon_uqshrn:
3066 case Intrinsic::aarch64_neon_uqxtn:
3067 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3068 break;
3069 }
3070
3071 return std::nullopt;
3072}
3073
3075 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3077}
3078
3081 switch (K) {
3083 return TypeSize::getFixed(64);
3085 if (ST->useSVEForFixedLengthVectors() &&
3086 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3087 return TypeSize::getFixed(
3088 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3089 else if (ST->isNeonAvailable())
3090 return TypeSize::getFixed(128);
3091 else
3092 return TypeSize::getFixed(0);
3094 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3096 return TypeSize::getScalable(128);
3097 else
3098 return TypeSize::getScalable(0);
3099 }
3100 llvm_unreachable("Unsupported register kind");
3101}
3102
3103bool AArch64TTIImpl::isSingleExtWideningInstruction(
3104 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3105 Type *SrcOverrideTy) const {
3106 // A helper that returns a vector type from the given type. The number of
3107 // elements in type Ty determines the vector width.
3108 auto toVectorTy = [&](Type *ArgTy) {
3109 return VectorType::get(ArgTy->getScalarType(),
3110 cast<VectorType>(DstTy)->getElementCount());
3111 };
3112
3113 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3114 // i32, i64]. SVE doesn't generally have the same set of instructions to
3115 // perform an extend with the add/sub/mul. There are SMULLB style
3116 // instructions, but they operate on top/bottom, requiring some sort of lane
3117 // interleaving to be used with zext/sext.
3118 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3119 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3120 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3121 return false;
3122
3123 Type *SrcTy = SrcOverrideTy;
3124 switch (Opcode) {
3125 case Instruction::Add: // UADDW(2), SADDW(2).
3126 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3127 // The second operand needs to be an extend
3128 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3129 if (!SrcTy)
3130 SrcTy =
3131 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3132 break;
3133 }
3134
3135 if (Opcode == Instruction::Sub)
3136 return false;
3137
3138 // UADDW(2), SADDW(2) can be commutted.
3139 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3140 if (!SrcTy)
3141 SrcTy =
3142 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3143 break;
3144 }
3145 return false;
3146 }
3147 default:
3148 return false;
3149 }
3150
3151 // Legalize the destination type and ensure it can be used in a widening
3152 // operation.
3153 auto DstTyL = getTypeLegalizationCost(DstTy);
3154 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3155 return false;
3156
3157 // Legalize the source type and ensure it can be used in a widening
3158 // operation.
3159 assert(SrcTy && "Expected some SrcTy");
3160 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3161 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3162 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3163 return false;
3164
3165 // Get the total number of vector elements in the legalized types.
3166 InstructionCost NumDstEls =
3167 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3168 InstructionCost NumSrcEls =
3169 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3170
3171 // Return true if the legalized types have the same number of vector elements
3172 // and the destination element type size is twice that of the source type.
3173 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3174}
3175
3176Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3178 Type *SrcOverrideTy) const {
3179 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3180 Opcode != Instruction::Mul)
3181 return nullptr;
3182
3183 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3184 // i32, i64]. SVE doesn't generally have the same set of instructions to
3185 // perform an extend with the add/sub/mul. There are SMULLB style
3186 // instructions, but they operate on top/bottom, requiring some sort of lane
3187 // interleaving to be used with zext/sext.
3188 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3189 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3190 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3191 return nullptr;
3192
3193 auto getScalarSizeWithOverride = [&](const Value *V) {
3194 if (SrcOverrideTy)
3195 return SrcOverrideTy->getScalarSizeInBits();
3196 return cast<Instruction>(V)
3197 ->getOperand(0)
3198 ->getType()
3199 ->getScalarSizeInBits();
3200 };
3201
3202 unsigned MaxEltSize = 0;
3203 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3204 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3205 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3206 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3207 MaxEltSize = std::max(EltSize0, EltSize1);
3208 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3209 isa<SExtInst, ZExtInst>(Args[1])) {
3210 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3211 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3212 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3213 // enough.
3214 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3215 return nullptr;
3216 MaxEltSize = DstEltSize / 2;
3217 } else if (Opcode == Instruction::Mul &&
3218 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3219 // If one of the operands is a Zext and the other has enough zero bits
3220 // to be treated as unsigned, we can still generate a umull, meaning the
3221 // zext is free.
3222 KnownBits Known =
3223 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3224 if (Args[0]->getType()->getScalarSizeInBits() -
3225 Known.Zero.countLeadingOnes() >
3226 DstTy->getScalarSizeInBits() / 2)
3227 return nullptr;
3228
3229 MaxEltSize =
3230 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3231 } else
3232 return nullptr;
3233
3234 if (MaxEltSize * 2 > DstEltSize)
3235 return nullptr;
3236
3237 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3238 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3239 return nullptr;
3240 return ExtTy;
3241}
3242
3243// s/urhadd instructions implement the following pattern, making the
3244// extends free:
3245// %x = add ((zext i8 -> i16), 1)
3246// %y = (zext i8 -> i16)
3247// trunc i16 (lshr (add %x, %y), 1) -> i8
3248//
3250 Type *Src) const {
3251 // The source should be a legal vector type.
3252 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3253 (Src->isScalableTy() && !ST->hasSVE2()))
3254 return false;
3255
3256 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3257 return false;
3258
3259 // Look for trunc/shl/add before trying to match the pattern.
3260 const Instruction *Add = ExtUser;
3261 auto *AddUser =
3262 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3263 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3264 Add = AddUser;
3265
3266 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3267 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3268 return false;
3269
3270 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3271 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3272 Src->getScalarSizeInBits() !=
3273 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3274 return false;
3275
3276 // Try to match the whole pattern. Ext could be either the first or second
3277 // m_ZExtOrSExt matched.
3278 Instruction *Ex1, *Ex2;
3279 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3280 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3281 return false;
3282
3283 // Ensure both extends are of the same type
3284 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3285 Ex1->getOpcode() == Ex2->getOpcode())
3286 return true;
3287
3288 return false;
3289}
3290
3292 Type *Src,
3295 const Instruction *I) const {
3296 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3297 assert(ISD && "Invalid opcode");
3298 // If the cast is observable, and it is used by a widening instruction (e.g.,
3299 // uaddl, saddw, etc.), it may be free.
3300 if (I && I->hasOneUser()) {
3301 auto *SingleUser = cast<Instruction>(*I->user_begin());
3302 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3303 if (Type *ExtTy = isBinExtWideningInstruction(
3304 SingleUser->getOpcode(), Dst, Operands,
3305 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3306 // The cost from Src->Src*2 needs to be added if required, the cost from
3307 // Src*2->ExtTy is free.
3308 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3309 Type *DoubleSrcTy =
3310 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3311 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3313 }
3314
3315 return 0;
3316 }
3317
3318 if (isSingleExtWideningInstruction(
3319 SingleUser->getOpcode(), Dst, Operands,
3320 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3321 // For adds only count the second operand as free if both operands are
3322 // extends but not the same operation. (i.e both operands are not free in
3323 // add(sext, zext)).
3324 if (SingleUser->getOpcode() == Instruction::Add) {
3325 if (I == SingleUser->getOperand(1) ||
3326 (isa<CastInst>(SingleUser->getOperand(1)) &&
3327 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3328 return 0;
3329 } else {
3330 // Others are free so long as isSingleExtWideningInstruction
3331 // returned true.
3332 return 0;
3333 }
3334 }
3335
3336 // The cast will be free for the s/urhadd instructions
3337 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3338 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3339 return 0;
3340 }
3341
3342 EVT SrcTy = TLI->getValueType(DL, Src);
3343 EVT DstTy = TLI->getValueType(DL, Dst);
3344
3345 if (!SrcTy.isSimple() || !DstTy.isSimple())
3346 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3347
3348 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3349 // we use fcvtx under SVE2. Give them invalid costs.
3350 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3351 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3352 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3354
3355 static const TypeConversionCostTblEntry BF16Tbl[] = {
3356 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3357 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3358 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3359 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3360 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3361 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3362 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3363 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3364 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3365 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3366 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3367 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3368 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3369 };
3370
3371 if (ST->hasBF16())
3372 if (const auto *Entry = ConvertCostTableLookup(
3373 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3374 return Entry->Cost;
3375
3376 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3377 // The cost of unpacking twice is artificially increased for now in order
3378 // to avoid regressions against NEON, which will use tbl instructions directly
3379 // instead of multiple layers of [s|u]unpk[lo|hi].
3380 // We use the unpacks in cases where the destination type is illegal and
3381 // requires splitting of the input, even if the input type itself is legal.
3382 const unsigned int SVE_EXT_COST = 1;
3383 const unsigned int SVE_FCVT_COST = 1;
3384 const unsigned int SVE_UNPACK_ONCE = 4;
3385 const unsigned int SVE_UNPACK_TWICE = 16;
3386
3387 static const TypeConversionCostTblEntry ConversionTbl[] = {
3388 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3389 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3390 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3391 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3392 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3393 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3394 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3395 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3396 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3397 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3398 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3399 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3400 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3401 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3402 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3403 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3404 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3405 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3406 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3407 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3408
3409 // Truncations on nxvmiN
3410 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3411 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3412 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3413 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3414 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3415 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3416 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3417 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3418 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3419 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3420 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3421 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3422 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3423 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3424 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3425 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3426 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3427 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3428 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3429 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3430 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3431 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3432 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3433 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3434 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3435 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3436 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3437 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3438 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3439 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3440 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3441 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3442 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3443
3444 // The number of shll instructions for the extension.
3445 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3446 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3447 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3448 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3449 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3450 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3451 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3452 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3453 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3454 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3455 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3456 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3457 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3458 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3459 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3460 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3461
3462 // FP Ext and trunc
3463 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3464 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3465 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3466 // FP16
3467 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3468 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3469 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3470 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3471 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3472 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3473 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3474 // BF16 (uses shift)
3475 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3476 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3477 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3478 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3479 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3480 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3481 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3482 // FP Ext and trunc
3483 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3484 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3485 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3486 // FP16
3487 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3488 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3489 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3490 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3491 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3492 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3493 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3494 // BF16 (more complex, with +bf16 is handled above)
3495 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3496 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3497 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3498 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3499 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3500 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3501 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3502 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3503
3504 // LowerVectorINT_TO_FP:
3505 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3506 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3507 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3508 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3509 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3510 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3511
3512 // SVE: to nxv2f16
3513 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3514 SVE_EXT_COST + SVE_FCVT_COST},
3515 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3516 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3517 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3518 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3519 SVE_EXT_COST + SVE_FCVT_COST},
3520 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3521 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3522 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3523
3524 // SVE: to nxv4f16
3525 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3526 SVE_EXT_COST + SVE_FCVT_COST},
3527 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3528 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3529 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3530 SVE_EXT_COST + SVE_FCVT_COST},
3531 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3532 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3533
3534 // SVE: to nxv8f16
3535 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3536 SVE_EXT_COST + SVE_FCVT_COST},
3537 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3538 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3539 SVE_EXT_COST + SVE_FCVT_COST},
3540 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3541
3542 // SVE: to nxv16f16
3543 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3544 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3545 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3546 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3547
3548 // Complex: to v2f32
3549 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3550 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3551 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3552 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3553
3554 // SVE: to nxv2f32
3555 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3556 SVE_EXT_COST + SVE_FCVT_COST},
3557 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3558 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3559 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3560 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3561 SVE_EXT_COST + SVE_FCVT_COST},
3562 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3563 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3564 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3565
3566 // Complex: to v4f32
3567 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3568 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3569 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3570 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3571
3572 // SVE: to nxv4f32
3573 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3574 SVE_EXT_COST + SVE_FCVT_COST},
3575 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3576 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3577 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3578 SVE_EXT_COST + SVE_FCVT_COST},
3579 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3580 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3581
3582 // Complex: to v8f32
3583 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3584 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3585 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3586 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3587
3588 // SVE: to nxv8f32
3589 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3590 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3591 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3592 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3593 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3594 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3595 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3596 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3597
3598 // SVE: to nxv16f32
3599 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3600 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3601 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3602 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3603
3604 // Complex: to v16f32
3605 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3606 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3607
3608 // Complex: to v2f64
3609 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3610 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3611 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3612 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3613 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3614 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3615
3616 // SVE: to nxv2f64
3617 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3618 SVE_EXT_COST + SVE_FCVT_COST},
3619 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3620 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3621 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3622 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3623 SVE_EXT_COST + SVE_FCVT_COST},
3624 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3625 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3626 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3627
3628 // Complex: to v4f64
3629 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3630 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3631
3632 // SVE: to nxv4f64
3633 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3634 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3635 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3636 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3637 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3638 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3639 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3640 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3641 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3642 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3643 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3644 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3645
3646 // SVE: to nxv8f64
3647 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3648 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3649 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3650 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3651 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3652 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3653 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3654 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3655
3656 // LowerVectorFP_TO_INT
3657 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3658 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3659 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3660 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3661 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3662 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3663
3664 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3665 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3666 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3667 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3668 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3669 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3670 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3671
3672 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3673 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3674 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3675 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3676 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3677
3678 // Complex, from nxv2f32.
3679 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3680 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3681 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3682 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3683 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3684 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3685 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3686 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3687
3688 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3689 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3690 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3691 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3692 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3693 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3694 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3695
3696 // Complex, from nxv2f64.
3697 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3698 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3699 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3700 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3701 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3702 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3703 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3704 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3705 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3706 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3707
3708 // Complex, from nxv4f32.
3709 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3710 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3711 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3712 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3713 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3714 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3715 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3716 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3717 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3718 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3719
3720 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3721 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3722 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3723 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3724 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3725
3726 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3727 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3728 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3729 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3730 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3731 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3732 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3733
3734 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3735 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3736 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3737 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3738 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3739
3740 // Complex, from nxv8f16.
3741 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3742 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3743 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3744 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3745 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3746 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3747 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3748 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3749 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3750 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3751
3752 // Complex, from nxv4f16.
3753 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3754 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3755 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3756 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3757 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3758 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3759 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3760 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3761
3762 // Complex, from nxv2f16.
3763 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3764 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3765 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3766 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3767 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3768 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3769 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3770 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3771
3772 // Truncate from nxvmf32 to nxvmf16.
3773 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3774 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3775 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3776
3777 // Truncate from nxvmf32 to nxvmbf16.
3778 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3779 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3780 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3781
3782 // Truncate from nxvmf64 to nxvmf16.
3783 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3784 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3785 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3786
3787 // Truncate from nxvmf64 to nxvmbf16.
3788 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3789 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3790 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3791
3792 // Truncate from nxvmf64 to nxvmf32.
3793 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3794 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3795 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3796
3797 // Extend from nxvmf16 to nxvmf32.
3798 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3799 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3800 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3801
3802 // Extend from nxvmbf16 to nxvmf32.
3803 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3804 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3805 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3806
3807 // Extend from nxvmf16 to nxvmf64.
3808 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3809 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3810 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3811
3812 // Extend from nxvmbf16 to nxvmf64.
3813 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3814 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3815 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3816
3817 // Extend from nxvmf32 to nxvmf64.
3818 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3819 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3820 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3821
3822 // Bitcasts from float to integer
3823 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3824 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3825 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3826
3827 // Bitcasts from integer to float
3828 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3829 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3830 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3831
3832 // Add cost for extending to illegal -too wide- scalable vectors.
3833 // zero/sign extend are implemented by multiple unpack operations,
3834 // where each operation has a cost of 1.
3835 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3836 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3837 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3838 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3839 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3840 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3841
3842 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3843 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3844 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3845 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3846 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3847 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3848 };
3849
3850 // We have to estimate a cost of fixed length operation upon
3851 // SVE registers(operations) with the number of registers required
3852 // for a fixed type to be represented upon SVE registers.
3853 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3854 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3855 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3856 ST->useSVEForFixedLengthVectors(WiderTy)) {
3857 std::pair<InstructionCost, MVT> LT =
3858 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3859 unsigned NumElements =
3860 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3861 return LT.first *
3863 Opcode,
3864 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3865 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3866 CostKind, I);
3867 }
3868
3869 if (const auto *Entry = ConvertCostTableLookup(
3870 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3871 return Entry->Cost;
3872
3873 static const TypeConversionCostTblEntry FP16Tbl[] = {
3874 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3875 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3876 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3877 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3878 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3879 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3880 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3881 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3882 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3883 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3884 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3885 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3886 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3887 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3888 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3889 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3890 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3891 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3892 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3893 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3894 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3895 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3896 };
3897
3898 if (ST->hasFullFP16())
3899 if (const auto *Entry = ConvertCostTableLookup(
3900 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3901 return Entry->Cost;
3902
3903 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3904 // double-rounding issues.
3905 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3906 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3908 return cast<FixedVectorType>(Dst)->getNumElements() *
3909 getCastInstrCost(Opcode, Dst->getScalarType(),
3910 Src->getScalarType(), CCH, CostKind) +
3912 true, CostKind) +
3914 false, CostKind);
3915
3916 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3918 ST->isSVEorStreamingSVEAvailable() &&
3919 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3921 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3923 // The standard behaviour in the backend for these cases is to split the
3924 // extend up into two parts:
3925 // 1. Perform an extending load or masked load up to the legal type.
3926 // 2. Extend the loaded data to the final type.
3927 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3928 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3930 Opcode, LegalTy, Src, CCH, CostKind, I);
3932 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3933 return Part1 + Part2;
3934 }
3935
3936 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3937 // but we also want to include the TTI::CastContextHint::Masked case too.
3938 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3940 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3942
3943 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3944}
3945
3948 VectorType *VecTy, unsigned Index,
3950
3951 // Make sure we were given a valid extend opcode.
3952 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3953 "Invalid opcode");
3954
3955 // We are extending an element we extract from a vector, so the source type
3956 // of the extend is the element type of the vector.
3957 auto *Src = VecTy->getElementType();
3958
3959 // Sign- and zero-extends are for integer types only.
3960 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3961
3962 // Get the cost for the extract. We compute the cost (if any) for the extend
3963 // below.
3964 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3965 CostKind, Index, nullptr, nullptr);
3966
3967 // Legalize the types.
3968 auto VecLT = getTypeLegalizationCost(VecTy);
3969 auto DstVT = TLI->getValueType(DL, Dst);
3970 auto SrcVT = TLI->getValueType(DL, Src);
3971
3972 // If the resulting type is still a vector and the destination type is legal,
3973 // we may get the extension for free. If not, get the default cost for the
3974 // extend.
3975 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3976 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3977 CostKind);
3978
3979 // The destination type should be larger than the element type. If not, get
3980 // the default cost for the extend.
3981 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3982 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3983 CostKind);
3984
3985 switch (Opcode) {
3986 default:
3987 llvm_unreachable("Opcode should be either SExt or ZExt");
3988
3989 // For sign-extends, we only need a smov, which performs the extension
3990 // automatically.
3991 case Instruction::SExt:
3992 return Cost;
3993
3994 // For zero-extends, the extend is performed automatically by a umov unless
3995 // the destination type is i64 and the element type is i8 or i16.
3996 case Instruction::ZExt:
3997 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3998 return Cost;
3999 }
4000
4001 // If we are unable to perform the extend for free, get the default cost.
4002 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4003 CostKind);
4004}
4005
4008 const Instruction *I) const {
4010 return Opcode == Instruction::PHI ? 0 : 1;
4011 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4012 // Branches are assumed to be predicted.
4013 return 0;
4014}
4015
4016InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4017 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4018 const Instruction *I, Value *Scalar,
4019 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4020 TTI::VectorInstrContext VIC) const {
4021 assert(Val->isVectorTy() && "This must be a vector type");
4022
4023 if (Index != -1U) {
4024 // Legalize the type.
4025 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4026
4027 // This type is legalized to a scalar type.
4028 if (!LT.second.isVector())
4029 return 0;
4030
4031 // The type may be split. For fixed-width vectors we can normalize the
4032 // index to the new type.
4033 if (LT.second.isFixedLengthVector()) {
4034 unsigned Width = LT.second.getVectorNumElements();
4035 Index = Index % Width;
4036 }
4037
4038 // The element at index zero is already inside the vector.
4039 // - For a insert-element or extract-element
4040 // instruction that extracts integers, an explicit FPR -> GPR move is
4041 // needed. So it has non-zero cost.
4042 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4043 return 0;
4044
4045 // This is recognising a LD1 single-element structure to one lane of one
4046 // register instruction. I.e., if this is an `insertelement` instruction,
4047 // and its second operand is a load, then we will generate a LD1, which
4048 // are expensive instructions on some uArchs.
4049 if (VIC == TTI::VectorInstrContext::Load) {
4050 if (ST->hasFastLD1Single())
4051 return 0;
4052 return CostKind == TTI::TCK_CodeSize
4053 ? 0
4055 }
4056
4057 // i1 inserts and extract will include an extra cset or cmp of the vector
4058 // value. Increase the cost by 1 to account.
4059 if (Val->getScalarSizeInBits() == 1)
4060 return CostKind == TTI::TCK_CodeSize
4061 ? 2
4062 : ST->getVectorInsertExtractBaseCost() + 1;
4063
4064 // FIXME:
4065 // If the extract-element and insert-element instructions could be
4066 // simplified away (e.g., could be combined into users by looking at use-def
4067 // context), they have no cost. This is not done in the first place for
4068 // compile-time considerations.
4069 }
4070
4071 // In case of Neon, if there exists extractelement from lane != 0 such that
4072 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4073 // 2. extractelement result feeds into fmul.
4074 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4075 // equivalent to 0.
4076 // then the extractelement can be merged with fmul in the backend and it
4077 // incurs no cost.
4078 // e.g.
4079 // define double @foo(<2 x double> %a) {
4080 // %1 = extractelement <2 x double> %a, i32 0
4081 // %2 = extractelement <2 x double> %a, i32 1
4082 // %res = fmul double %1, %2
4083 // ret double %res
4084 // }
4085 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4086 auto ExtractCanFuseWithFmul = [&]() {
4087 // We bail out if the extract is from lane 0.
4088 if (Index == 0)
4089 return false;
4090
4091 // Check if the scalar element type of the vector operand of ExtractElement
4092 // instruction is one of the allowed types.
4093 auto IsAllowedScalarTy = [&](const Type *T) {
4094 return T->isFloatTy() || T->isDoubleTy() ||
4095 (T->isHalfTy() && ST->hasFullFP16());
4096 };
4097
4098 // Check if the extractelement user is scalar fmul.
4099 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4100 // Check if the user is scalar fmul.
4101 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4102 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4103 !BO->getType()->isVectorTy();
4104 };
4105
4106 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4107 // certain scalar type and a certain vector register width.
4108 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4109 auto RegWidth =
4111 .getFixedValue();
4112 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4113 };
4114
4115 // Check if the type constraints on input vector type and result scalar type
4116 // of extractelement instruction are satisfied.
4117 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4118 return false;
4119
4120 if (Scalar) {
4121 DenseMap<User *, unsigned> UserToExtractIdx;
4122 for (auto *U : Scalar->users()) {
4123 if (!IsUserFMulScalarTy(U))
4124 return false;
4125 // Recording entry for the user is important. Index value is not
4126 // important.
4127 UserToExtractIdx[U];
4128 }
4129 if (UserToExtractIdx.empty())
4130 return false;
4131 for (auto &[S, U, L] : ScalarUserAndIdx) {
4132 for (auto *U : S->users()) {
4133 if (UserToExtractIdx.contains(U)) {
4134 auto *FMul = cast<BinaryOperator>(U);
4135 auto *Op0 = FMul->getOperand(0);
4136 auto *Op1 = FMul->getOperand(1);
4137 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4138 UserToExtractIdx[U] = L;
4139 break;
4140 }
4141 }
4142 }
4143 }
4144 for (auto &[U, L] : UserToExtractIdx) {
4145 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4146 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4147 return false;
4148 }
4149 } else {
4150 const auto *EE = cast<ExtractElementInst>(I);
4151
4152 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4153 if (!IdxOp)
4154 return false;
4155
4156 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4157 if (!IsUserFMulScalarTy(U))
4158 return false;
4159
4160 // Check if the other operand of extractelement is also extractelement
4161 // from lane equivalent to 0.
4162 const auto *BO = cast<BinaryOperator>(U);
4163 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4164 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4165 if (OtherEE) {
4166 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4167 if (!IdxOp)
4168 return false;
4169 return IsExtractLaneEquivalentToZero(
4170 cast<ConstantInt>(OtherEE->getIndexOperand())
4171 ->getValue()
4172 .getZExtValue(),
4173 OtherEE->getType()->getScalarSizeInBits());
4174 }
4175 return true;
4176 });
4177 }
4178 return true;
4179 };
4180
4181 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4182 ExtractCanFuseWithFmul())
4183 return 0;
4184
4185 // All other insert/extracts cost this much.
4186 return CostKind == TTI::TCK_CodeSize ? 1
4187 : ST->getVectorInsertExtractBaseCost();
4188}
4189
4191 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4192 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
4193 // Treat insert at lane 0 into a poison vector as having zero cost. This
4194 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4195 // single dup) are treated as cheap.
4196 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4197 isa<PoisonValue>(Op0))
4198 return 0;
4199 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4200 nullptr, {}, VIC);
4201}
4202
4204 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4205 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4206 TTI::VectorInstrContext VIC) const {
4207 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4208 ScalarUserAndIdx, VIC);
4209}
4210
4213 TTI::TargetCostKind CostKind, unsigned Index,
4214 TTI::VectorInstrContext VIC) const {
4215 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4216 nullptr, {}, VIC);
4217}
4218
4222 unsigned Index) const {
4223 if (isa<FixedVectorType>(Val))
4225 Index);
4226
4227 // This typically requires both while and lastb instructions in order
4228 // to extract the last element. If this is in a loop the while
4229 // instruction can at least be hoisted out, although it will consume a
4230 // predicate register. The cost should be more expensive than the base
4231 // extract cost, which is 2 for most CPUs.
4232 return CostKind == TTI::TCK_CodeSize
4233 ? 2
4234 : ST->getVectorInsertExtractBaseCost() + 1;
4235}
4236
4238 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4239 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4240 TTI::VectorInstrContext VIC) const {
4243 if (Ty->getElementType()->isFloatingPointTy())
4244 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4245 CostKind);
4246 unsigned VecInstCost =
4247 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4248 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4249}
4250
// Returns the cost of running an FP operation on half/bfloat types by
// promoting to f32, running `InstCost` on the promoted type, and (optionally)
// truncating back. Returns std::nullopt when no promotion is needed (non
// fp16/bf16 scalar types, native fp16 support, or SVE B16B16 when allowed).
// NOTE(review): doxygen-rendered capture — original lines 4252 (leading
// parameters, presumably Ty/CostKind/Op1Info), 4265 and 4271 (the CostKind
// arguments of the getCastInstrCost calls) are missing; confirm against the
// original source.
 4251 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
 4253     TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
 4254     std::function<InstructionCost(Type *)> InstCost) const {
 4255   if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
 4256     return std::nullopt;
 4257   if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
 4258     return std::nullopt;
 4259   if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
 4260       ST->isNonStreamingSVEorSME2Available())
 4261     return std::nullopt;
 4262
 4263   Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
 4264   InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
 // Both operands need extending when neither is a constant, doubling the
 // extension cost.
 4266   if (!Op1Info.isConstant() && !Op2Info.isConstant())
 4267     Cost *= 2;
 4268   Cost += InstCost(PromotedTy);
 4269   if (IncludeTrunc)
 4270     Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
 4272   return Cost;
 4273}
4274
// AArch64 cost model for arithmetic instructions (presumably
// AArch64TTIImpl::getArithmeticInstrCost — the qualified-name line of the
// signature is missing from this doxygen capture).
// NOTE(review): several original lines were dropped with their hyperlinked
// tokens, including 4286 (likely `return InstructionCost::getInvalid();`),
// 4289 (a CostKind guard before the BaseT fallback), 4319 (a CostKind
// argument), 4399 (`InstructionCost Cost =`), 4487 (`InstructionCost Cost =
// BaseT::getArithmeticInstrCost(`), and 4522 (`InstructionCost DivCost =
// getArithmeticInstrCost(`). Confirm each against the original source.
 4276     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
 4278     ArrayRef<const Value *> Args, const Instruction *CxtI) const {
 4279
 4280   // The code-generator is currently not able to handle scalable vectors
 4281   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
 4282   // it. This change will be removed when code-generation for these types is
 4283   // sufficiently reliable.
 4284   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
 4285     if (VTy->getElementCount() == ElementCount::getScalable(1))
 4287
 4288   // TODO: Handle more cost kinds.
 4290     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
 4291                                          Op2Info, Args, CxtI);
 4292
 4293   // Legalize the type.
 4294   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 4295   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 4296
 4297   // Increase the cost for half and bfloat types if not architecturally
 4298   // supported.
 4299   if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
 4300       ISD == ISD::FDIV || ISD == ISD::FREM)
 4301     if (auto PromotedCost = getFP16BF16PromoteCost(
 4302             Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
 4303             // There is not native support for fdiv/frem even with +sve-b16b16.
 4304             /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
 4305             [&](Type *PromotedTy) {
 4306               return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
 4307                                             Op1Info, Op2Info);
 4308             }))
 4309       return *PromotedCost;
 4310
 4311   // If the operation is a widening instruction (smull or umull) and both
 4312   // operands are extends the cost can be cheaper by considering that the
 4313   // operation will operate on the narrowest type size possible (double the
 4314   // largest input size) and a further extend.
 4315   if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
 4316     if (ExtTy != Ty)
 4317       return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
 4318              getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
 4320     return LT.first;
 4321   }
 4322
 4323   switch (ISD) {
 4324   default:
 4325     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
 4326                                          Op2Info);
 4327   case ISD::SREM:
 4328   case ISD::SDIV:
 4329     /*
 4330     Notes for sdiv/srem specific costs:
 4331     1. This only considers the cases where the divisor is constant, uniform and
 4332     (pow-of-2/non-pow-of-2). Other cases are not important since they either
 4333     result in some form of (ldr + adrp), corresponding to constant vectors, or
 4334     scalarization of the division operation.
 4335     2. Constant divisors, either negative in whole or partially, don't result in
 4336     significantly different codegen as compared to positive constant divisors.
 4337     So, we don't consider negative divisors separately.
 4338     3. If the codegen is significantly different with SVE, it has been indicated
 4339     using comments at appropriate places.
 4340
 4341     sdiv specific cases:
 4342     -----------------------------------------------------------------------
 4343     codegen                       | pow-of-2               | Type
 4344     -----------------------------------------------------------------------
 4345     add + cmp + csel + asr        | Y                      | i64
 4346     add + cmp + csel + asr        | Y                      | i32
 4347     -----------------------------------------------------------------------
 4348
 4349     srem specific cases:
 4350     -----------------------------------------------------------------------
 4351     codegen                       | pow-of-2               | Type
 4352     -----------------------------------------------------------------------
 4353     negs + and + and + csneg      | Y                      | i64
 4354     negs + and + and + csneg      | Y                      | i32
 4355     -----------------------------------------------------------------------
 4356
 4357     other sdiv/srem cases:
 4358     -------------------------------------------------------------------------
 4359     common codegen            | + srem     | + sdiv       | pow-of-2 | Type
 4360     -------------------------------------------------------------------------
 4361     smulh + asr + add + add   | -          | -            | N        | i64
 4362     smull + lsr + add + add   | -          | -            | N        | i32
 4363     usra                      | and + sub  | sshr         | Y        | <2 x i64>
 4364     2 * (scalar code)         | -          | -            | N        | <2 x i64>
 4365     usra                      | bic + sub  | sshr + neg   | Y        | <4 x i32>
 4366     smull2 + smull + uzp2     | mls        | -            | N        | <4 x i32>
 4367     + sshr + usra             |            |              |          |
 4368     -------------------------------------------------------------------------
 4369     */
 4370     if (Op2Info.isConstant() && Op2Info.isUniform()) {
 4371       InstructionCost AddCost =
 4372           getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
 4373                                  Op1Info.getNoProps(), Op2Info.getNoProps());
 4374       InstructionCost AsrCost =
 4375           getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
 4376                                  Op1Info.getNoProps(), Op2Info.getNoProps());
 4377       InstructionCost MulCost =
 4378           getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
 4379                                  Op1Info.getNoProps(), Op2Info.getNoProps());
 4380       // add/cmp/csel/csneg should have similar cost while asr/negs/and should
 4381       // have similar cost.
 4382       auto VT = TLI->getValueType(DL, Ty);
 4383       if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
 4384         if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
 4385           // Neg can be folded into the asr instruction.
 4386           return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
 4387                                   : (3 * AsrCost + AddCost);
 4388         } else {
 4389           return MulCost + AsrCost + 2 * AddCost;
 4390         }
 4391       } else if (VT.isVector()) {
 4392         InstructionCost UsraCost = 2 * AsrCost;
 4393         if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
 4394           // Division with scalable types corresponds to native 'asrd'
 4395           // instruction when SVE is available.
 4396           // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
 4397
 4398           // One more for the negation in SDIV
 4400               (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
 4401           if (Ty->isScalableTy() && ST->hasSVE())
 4402             Cost += 2 * AsrCost;
 4403           else {
 4404             Cost +=
 4405                 UsraCost +
 4406                 (ISD == ISD::SDIV
 4407                      ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
 4408                      : 2 * AddCost);
 4409           }
 4410           return Cost;
 4411         } else if (LT.second == MVT::v2i64) {
 4412           return VT.getVectorNumElements() *
 4413                  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
 4414                                         Op1Info.getNoProps(),
 4415                                         Op2Info.getNoProps());
 4416         } else {
 4417           // When SVE is available, we get:
 4418           // smulh + lsr + add/sub + asr + add/sub.
 4419           if (Ty->isScalableTy() && ST->hasSVE())
 4420             return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
 4421           return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
 4422         }
 4423       }
 4424     }
 4425     if (Op2Info.isConstant() && !Op2Info.isUniform() &&
 4426         LT.second.isFixedLengthVector()) {
 4427       // FIXME: When the constant vector is non-uniform, this may result in
 4428       // loading the vector from constant pool or in some cases, may also result
 4429       // in scalarization. For now, we are approximating this with the
 4430       // scalarization cost.
 4431       auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
 4432                                                 CostKind, -1, nullptr, nullptr);
 4433       auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
 4434                                            CostKind, -1, nullptr, nullptr);
 4435       unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
 4436       return ExtractCost + InsertCost +
 4437              NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
 4438                                             CostKind, Op1Info.getNoProps(),
 4439                                             Op2Info.getNoProps());
 4440     }
 4441     [[fallthrough]];
 4442   case ISD::UDIV:
 4443   case ISD::UREM: {
 4444     auto VT = TLI->getValueType(DL, Ty);
 4445     if (Op2Info.isConstant()) {
 4446       // If the operand is a power of 2 we can use the shift or and cost.
 4447       if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
 4448         return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
 4449                                       Op1Info.getNoProps(),
 4450                                       Op2Info.getNoProps());
 4451       if (ISD == ISD::UREM && Op2Info.isPowerOf2())
 4452         return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
 4453                                       Op1Info.getNoProps(),
 4454                                       Op2Info.getNoProps());
 4455
 4456       if (ISD == ISD::UDIV || ISD == ISD::UREM) {
 4457         // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
 4458         // The MULHU will be expanded to UMULL for the types not listed below,
 4459         // and will become a pair of UMULL+MULL2 for 128bit vectors.
 4460         bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
 4461                        LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
 4462                        LT.second == MVT::nxv16i8;
 4463         bool Is128bit = LT.second.is128BitVector();
 4464
 4465         InstructionCost MulCost =
 4466             getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
 4467                                    Op1Info.getNoProps(), Op2Info.getNoProps());
 4468         InstructionCost AddCost =
 4469             getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
 4470                                    Op1Info.getNoProps(), Op2Info.getNoProps());
 4471         InstructionCost ShrCost =
 4472             getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
 4473                                    Op1Info.getNoProps(), Op2Info.getNoProps());
 4474         InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
 4475                                   (HasMULH ? 0 : ShrCost) +      // UMULL shift
 4476                                   AddCost * 2 + ShrCost;
 4477         return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
 4478       }
 4479     }
 4480
 4481     // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
 4482     // emitted by the backend even when those functions are not declared in the
 4483     // module.
 4484     if (!VT.isVector() && VT.getSizeInBits() > 64)
 4485       return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
 4486
 4488         Opcode, Ty, CostKind, Op1Info, Op2Info);
 4489     if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
 4490       if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
 4491         // SDIV/UDIV operations are lowered using SVE, then we can have less
 4492         // costs.
 4493         if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
 4494             Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
 4495           static const CostTblEntry DivTbl[]{
 4496               {ISD::SDIV, MVT::v2i8, 5},  {ISD::SDIV, MVT::v4i8, 8},
 4497               {ISD::SDIV, MVT::v8i8, 8},  {ISD::SDIV, MVT::v2i16, 5},
 4498               {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
 4499               {ISD::UDIV, MVT::v2i8, 5},  {ISD::UDIV, MVT::v4i8, 8},
 4500               {ISD::UDIV, MVT::v8i8, 8},  {ISD::UDIV, MVT::v2i16, 5},
 4501               {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
 4502
 4503           const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
 4504           if (nullptr != Entry)
 4505             return Entry->Cost;
 4506         }
 4507         // For 8/16-bit elements, the cost is higher because the type
 4508         // requires promotion and possibly splitting:
 4509         if (LT.second.getScalarType() == MVT::i8)
 4510           Cost *= 8;
 4511         else if (LT.second.getScalarType() == MVT::i16)
 4512           Cost *= 4;
 4513         return Cost;
 4514       } else {
 4515         // If one of the operands is a uniform constant then the cost for each
 4516         // element is Cost for insertion, extraction and division.
 4517         // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
 4518         // operation with scalar type
 4519         if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
 4520             (Op2Info.isConstant() && Op2Info.isUniform())) {
 4521           if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
 4523                 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
 4524             return (4 + DivCost) * VTy->getNumElements();
 4525           }
 4526         }
 4527         // On AArch64, without SVE, vector divisions are expanded
 4528         // into scalar divisions of each pair of elements.
 4529         Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
 4530                                    -1, nullptr, nullptr);
 4531         Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
 4532                                    nullptr, nullptr);
 4533       }
 4534
 4535       // TODO: if one of the arguments is scalar, then it's not necessary to
 4536       // double the cost of handling the vector elements.
 4537       Cost += Cost;
 4538     }
 4539     return Cost;
 4540   }
 4541   case ISD::MUL:
 4542     // When SVE is available, then we can lower the v2i64 operation using
 4543     // the SVE mul instruction, which has a lower cost.
 4544     if (LT.second == MVT::v2i64 && ST->hasSVE())
 4545       return LT.first;
 4546
 4547     // When SVE is not available, there is no MUL.2d instruction,
 4548     // which means mul <2 x i64> is expensive as elements are extracted
 4549     // from the vectors and the muls scalarized.
 4550     // As getScalarizationOverhead is a bit too pessimistic, we
 4551     // estimate the cost for a i64 vector directly here, which is:
 4552     // - four 2-cost i64 extracts,
 4553     // - two 2-cost i64 inserts, and
 4554     // - two 1-cost muls.
 4555     // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
 4556     // LT.first = 2 the cost is 28.
 4557     if (LT.second != MVT::v2i64)
 4558       return LT.first;
 4559     return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
 4560            (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
 4561             getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
 4562                                nullptr, nullptr) *
 4563                 2 +
 4564             getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
 4565                                nullptr, nullptr));
 4566   case ISD::ADD:
 4567   case ISD::XOR:
 4568   case ISD::OR:
 4569   case ISD::AND:
 4570   case ISD::SRL:
 4571   case ISD::SRA:
 4572   case ISD::SHL:
 4573     // These nodes are marked as 'custom' for combining purposes only.
 4574     // We know that they are legal. See LowerAdd in ISelLowering.
 4575     return LT.first;
 4576
 4577   case ISD::FNEG:
 4578     // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
 4579     if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
 4580          (Ty->isHalfTy() && ST->hasFullFP16())) &&
 4581         CxtI &&
 4582         ((CxtI->hasOneUse() &&
 4583           match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
 4584          match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
 4585       return 0;
 4586     [[fallthrough]];
 4587   case ISD::FADD:
 4588   case ISD::FSUB:
 4589     if (!Ty->getScalarType()->isFP128Ty())
 4590       return LT.first;
 4591     [[fallthrough]];
 4592   case ISD::FMUL:
 4593   case ISD::FDIV:
 4594     // These nodes are marked as 'custom' just to lower them to SVE.
 4595     // We know said lowering will incur no additional cost.
 4596     if (!Ty->getScalarType()->isFP128Ty())
 4597       return 2 * LT.first;
 4598
 4599     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
 4600                                          Op2Info);
 4601   case ISD::FREM:
 4602     // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
 4603     // those functions are not declared in the module.
 4604     if (!Ty->isVectorTy())
 4605       return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
 4606     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
 4607                                          Op2Info);
 4608   }
 4609}
4610
// Cost of computing a vectorized address (presumably
// AArch64TTIImpl::getAddressComputationCost — the signature's opening lines,
// including the PtrTy and ScalarEvolution parameters, are missing from this
// doxygen capture; confirm against the original source).
 4613                                                         const SCEV *Ptr,
 4615   // Address computations in vectorized code with non-consecutive addresses will
 4616   // likely result in more instructions compared to scalar code where the
 4617   // computation can more often be merged into the index mode. The resulting
 4618   // extra micro-ops can significantly decrease throughput.
 4619   unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
 4620   int MaxMergeDistance = 64;
 4621
 // Charge the stride overhead only when SCEV cannot prove a small constant
 // stride for the vector access.
 4622   if (PtrTy->isVectorTy() && SE &&
 4623       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
 4624     return NumVectorInstToHideOverhead;
 4625
 4626   // In many cases the address computation is not merged into the instruction
 4627   // addressing mode.
 4628   return 1;
 4629}
4630
 4631/// Check whether Opcode1 has less throughput according to the scheduling
 4632/// model than Opcode2.
// NOTE(review): doxygen capture — the function-name line (original 4633) and
// the second operand of the final comparison (original 4654, presumably
// `MCSchedModel::getReciprocalThroughput(*ST, *SCD2);`) are missing; confirm
// against the original source.
 4634     unsigned Opcode1, unsigned Opcode2) const {
 4635   const MCSchedModel &Sched = ST->getSchedModel();
 4636   const TargetInstrInfo *TII = ST->getInstrInfo();
 // Without per-instruction scheduling info there is nothing to compare.
 4637   if (!Sched.hasInstrSchedModel())
 4638     return false;
 4639
 4640   const MCSchedClassDesc *SCD1 =
 4641       Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
 4642   const MCSchedClassDesc *SCD2 =
 4643       Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
 4644   // We cannot handle variant scheduling classes without an MI. If we need to
 4645   // support them for any of the instructions we query the information of we
 4646   // might need to add a way to resolve them without a MI or not use the
 4647   // scheduling info.
 4648   assert(!SCD1->isVariant() && !SCD2->isVariant() &&
 4649          "Cannot handle variant scheduling classes without an MI");
 4650   if (!SCD1->isValid() || !SCD2->isValid())
 4651     return false;
 4652
 // Higher reciprocal throughput means fewer instructions can issue per cycle,
 // i.e. lower throughput.
 4653   return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
 4655}
4656
// AArch64 cost model for compare and select instructions (presumably
// AArch64TTIImpl::getCmpSelInstrCost — the qualified-name line is missing
// from this doxygen capture).
// NOTE(review): original lines 4657/4659 (signature head and the
// CostKind/Op1Info parameter line), 4721 (`InstructionCost Cost =`),
// 4725 and 4727-4729 (the remaining arguments of the vector Trunc cast-cost
// call), and 4756 (likely a CostKind guard) are missing; confirm against the
// original source.
 4658     unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
 4660     TTI::OperandValueInfo Op2Info, const Instruction *I) const {
 4661   // We don't lower some vector selects well that are wider than the register
 4662   // width. TODO: Improve this with different cost kinds.
 4663   if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
 4664     // We would need this many instructions to hide the scalarization happening.
 4665     const int AmortizationCost = 20;
 4666
 4667     // If VecPred is not set, check if we can get a predicate from the context
 4668     // instruction, if its type matches the requested ValTy.
 4669     if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
 4670       CmpPredicate CurrentPred;
 4671       if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
 4672                             m_Value())))
 4673         VecPred = CurrentPred;
 4674     }
 4675     // Check if we have a compare/select chain that can be lowered using
 4676     // a (F)CMxx & BFI pair.
 4677     if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
 4678         VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
 4679         VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
 4680         VecPred == CmpInst::FCMP_UNE) {
 4681       static const auto ValidMinMaxTys = {
 4682           MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
 4683           MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
 4684       static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
 4685
 4686       auto LT = getTypeLegalizationCost(ValTy);
 4687       if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
 4688           (ST->hasFullFP16() &&
 4689            any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
 4690         return LT.first;
 4691     }
 4692
 4693     static const TypeConversionCostTblEntry VectorSelectTbl[] = {
 4694         {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
 4695         {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
 4696         {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
 4697         {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
 4698         {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
 4699         {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
 4700         {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
 4701         {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
 4702         {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
 4703         {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
 4704         {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
 4705
 4706     EVT SelCondTy = TLI->getValueType(DL, CondTy);
 4707     EVT SelValTy = TLI->getValueType(DL, ValTy);
 4708     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
 4709       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
 4710                                                      SelCondTy.getSimpleVT(),
 4711                                                      SelValTy.getSimpleVT()))
 4712         return Entry->Cost;
 4713     }
 4714   }
 4715
 4716   if (Opcode == Instruction::FCmp) {
 // fp16/bf16 compares without native support are costed via promotion to f32
 // (no truncation back — the result is a predicate, not an FP value).
 4717     if (auto PromotedCost = getFP16BF16PromoteCost(
 4718             ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
 4719             // TODO: Consider costing SVE FCMPs.
 4720             /*CanUseSVE=*/false, [&](Type *PromotedTy) {
 4722                   getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
 4723                                      CostKind, Op1Info, Op2Info);
 4724               if (isa<VectorType>(PromotedTy))
 4726                     Instruction::Trunc,
 4730               return Cost;
 4731             }))
 4732       return *PromotedCost;
 4733
 4734     auto LT = getTypeLegalizationCost(ValTy);
 4735     // Model unknown fp compares as a libcall.
 4736     if (LT.second.getScalarType() != MVT::f64 &&
 4737         LT.second.getScalarType() != MVT::f32 &&
 4738         LT.second.getScalarType() != MVT::f16)
 4739       return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
 4740                                          {ValTy, ValTy}, CostKind);
 4741
 4742     // Some comparison operators require expanding to multiple compares + or.
 4743     unsigned Factor = 1;
 4744     if (!CondTy->isVectorTy() &&
 4745         (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
 4746       Factor = 2; // fcmp with 2 selects
 4747     else if (isa<FixedVectorType>(ValTy) &&
 4748              (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
 4749               VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
 4750       Factor = 3; // fcmxx+fcmyy+or
 4751     else if (isa<ScalableVectorType>(ValTy) &&
 4752              (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
 4753       Factor = 3; // fcmxx+fcmyy+or
 4754
 // Penalize SVE predicate compares on cores where the scheduling model shows
 // they have lower throughput than the equivalent NEON compare.
 4755     if (isa<ScalableVectorType>(ValTy) &&
 4757         hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
 4758                                                    AArch64::FCMEQv4f32))
 4759       Factor *= 2;
 4760
 4761     return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
 4762   }
 4763
 4764   // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
 4765   // icmp(and, 0) as free, as we can make use of ands, but only if the
 4766   // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
 4767   // providing it will not cause performance regressions.
 4768   if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
 4769       Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
 4770       TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
 4771       match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
 4772     if (match(I->getOperand(1), m_Zero()))
 4773       return 0;
 4774
 4775     // x >= 1 / x < 1 -> x > 0 / x <= 0
 4776     if (match(I->getOperand(1), m_One()) &&
 4777         (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
 4778       return 0;
 4779
 4780     // x <= -1 / x > -1 -> x > 0 / x <= 0
 4781     if (match(I->getOperand(1), m_AllOnes()) &&
 4782         (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
 4783       return 0;
 4784   }
 4785
 4786   // The base case handles scalable vectors fine for now, since it treats the
 4787   // cost as 1 * legalization cost.
 4788   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
 4789                                    Op1Info, Op2Info, I);
 4790}
4791
// Configure memcmp expansion for AArch64.
// NOTE(review): doxygen capture — the return-type line (original 4792) and
// line 4794 (presumably the declaration of `Options`) are missing; confirm
// against the original source.
 4793AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
 // With strict alignment, return the default (empty) options: misaligned
 // loads would expand badly and are not yet modeled.
 4795   if (ST->requiresStrictAlign()) {
 4796     // TODO: Add cost modeling for strict align. Misaligned loads expand to
 4797     // a bunch of instructions when strict align is enabled.
 4798     return Options;
 4799   }
 4800   Options.AllowOverlappingLoads = true;
 4801   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
 4802   Options.NumLoadsPerBlock = Options.MaxNumLoads;
 4803   // TODO: Though vector loads usually perform well on AArch64, in some targets
 4804   // they may wake up the FP unit, which raises the power consumption. Perhaps
 4805   // they could be used with no holds barred (-O3).
 4806   Options.LoadSizes = {8, 4, 2, 1};
 4807   Options.AllowedTailExpansions = {3, 5, 6};
 4808   return Options;
 4809}
4810
// NOTE(review): the signature line (original 4811) was dropped from this
// doxygen capture — this one-line body simply reports SVE availability;
// identify the function name from the original source before relying on it.
 4812   return ST->hasSVE();
 4813}
4814
// Dispatch masked-memory intrinsic costing to the specialized handlers.
// NOTE(review): doxygen capture — the signature lines (original 4815-4817)
// and line 4826 (the fall-through return after the switch, presumably a
// BaseT/invalid-cost fallback) are missing; confirm against the original
// source.
 4818   switch (MICA.getID()) {
 4819   case Intrinsic::masked_scatter:
 4820   case Intrinsic::masked_gather:
 4821     return getGatherScatterOpCost(MICA, CostKind);
 4822   case Intrinsic::masked_load:
 4823   case Intrinsic::masked_store:
 4824     return getMaskedMemoryOpCost(MICA, CostKind);
 4825   }
 4827}
4828
// Cost of a masked load/store (presumably AArch64TTIImpl::getMaskedMemoryOpCost
// — the signature lines are missing from this doxygen capture). The dropped
// lines after each guard (original 4835, 4838, 4843, 4850) presumably
// returned a fallback or `InstructionCost::getInvalid()`; confirm against the
// original source.
 4832   Type *Src = MICA.getDataType();
 4833
 // NEON (fixed-length, no SVE) has no native masked load/store.
 4834   if (useNeonVector(Src))
 4836   auto LT = getTypeLegalizationCost(Src);
 4837   if (!LT.first.isValid())
 4839
 4840   // Return an invalid cost for element types that we are unable to lower.
 4841   auto *VT = cast<VectorType>(Src);
 4842   if (VT->getElementType()->isIntegerTy(1))
 4844
 4845   // The code-generator is currently not able to handle scalable vectors
 4846   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
 4847   // it. This change will be removed when code-generation for these types is
 4848   // sufficiently reliable.
 4849   if (VT->getElementCount() == ElementCount::getScalable(1))
 4851
 4852   return LT.first;
 4853}
4854
4855// This function returns gather/scatter overhead either from
4856// user-provided value or specialized values per-target from \p ST.
4857static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4858 const AArch64Subtarget *ST) {
4859 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4860 "Should be called on only load or stores.");
4861 switch (Opcode) {
4862 case Instruction::Load:
4863 if (SVEGatherOverhead.getNumOccurrences() > 0)
4864 return SVEGatherOverhead;
4865 return ST->getGatherOverhead();
4866 break;
4867 case Instruction::Store:
4868 if (SVEScatterOverhead.getNumOccurrences() > 0)
4869 return SVEScatterOverhead;
4870 return ST->getScatterOverhead();
4871 break;
4872 default:
4873 llvm_unreachable("Shouldn't have reached here");
4874 }
4875}
4876
// Cost of a gather/scatter (presumably AArch64TTIImpl::getGatherScatterOpCost
// — the signature lines, original 4877-4879, are missing from this doxygen
// capture). The dropped lines after each guard (original 4891, 4895, 4901,
// 4908) presumably returned a base-class fallback or an invalid cost;
// confirm against the original source.
 4880
 // Gathers map to loads, scatters to stores; vp_* variants follow suit.
 4881   unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
 4882                      MICA.getID() == Intrinsic::vp_gather)
 4883                         ? Instruction::Load
 4884                         : Instruction::Store;
 4885
 4886   Type *DataTy = MICA.getDataType();
 4887   Align Alignment = MICA.getAlignment();
 4888   const Instruction *I = MICA.getInst();
 4889
 4890   if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
 4892   auto *VT = cast<VectorType>(DataTy);
 4893   auto LT = getTypeLegalizationCost(DataTy);
 4894   if (!LT.first.isValid())
 4896
 4897   // Return an invalid cost for element types that we are unable to lower.
 4898   if (!LT.second.isVector() ||
 4899       !isElementTypeLegalForScalableVector(VT->getElementType()) ||
 4900       VT->getElementType()->isIntegerTy(1))
 4902
 4903   // The code-generator is currently not able to handle scalable vectors
 4904   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
 4905   // it. This change will be removed when code-generation for these types is
 4906   // sufficiently reliable.
 4907   if (VT->getElementCount() == ElementCount::getScalable(1))
 4909
 // Model the op as one scalar memory access per (max) element, scaled by the
 // gather/scatter overhead factor.
 4910   ElementCount LegalVF = LT.second.getVectorElementCount();
 4911   InstructionCost MemOpCost =
 4912       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
 4913                       {TTI::OK_AnyValue, TTI::OP_None}, I);
 4914   // Add on an overhead cost for using gathers/scatters.
 4915   MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
 4916   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 4917}
4918
// NOTE(review): the signature line (original 4919) was dropped from this
// doxygen capture — the body reports "fixed-length vector that will be
// lowered with NEON rather than SVE"; identify the function name (likely
// useNeonVector) from the original source.
 4920   return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
 4921}
4922
// AArch64 memory-op (load/store) cost model (presumably
// AArch64TTIImpl::getMemoryOpCost — the signature head is missing from this
// doxygen capture). Also missing: original lines 4926 (a parameter line),
// 4937 (invalid-cost return), 4948-4949 (predicate-multiple check tail and
// invalid-cost return), 4952/4955 (cost-kind guards), and 4997
// (`InstructionCost Cost = 0;`). Confirm each against the original source.
 4924                                             Align Alignment,
 4925                                             unsigned AddressSpace,
 4927                                             TTI::OperandValueInfo OpInfo,
 4928                                             const Instruction *I) const {
 4929   EVT VT = TLI->getValueType(DL, Ty, true);
 4930   // Type legalization can't handle structs
 4931   if (VT == MVT::Other)
 4932     return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
 4933                                   CostKind);
 4934
 4935   auto LT = getTypeLegalizationCost(Ty);
 4936   if (!LT.first.isValid())
 4938
 4939   // The code-generator is currently not able to handle scalable vectors
 4940   // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
 4941   // it. This change will be removed when code-generation for these types is
 4942   // sufficiently reliable.
 4943   // We also only support full register predicate loads and stores.
 4944   if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
 4945     if (VTy->getElementCount() == ElementCount::getScalable(1) ||
 4946         (VTy->getElementType()->isIntegerTy(1) &&
 4947          !VTy->getElementCount().isKnownMultipleOf(
 4950
 4951   // TODO: consider latency as well for TCK_SizeAndLatency.
 4953     return LT.first;
 4954
 4956     return 1;
 4957
 4958   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
 4959       LT.second.is128BitVector() && Alignment < Align(16)) {
 4960     // Unaligned stores are extremely inefficient. We don't split all
 4961     // unaligned 128-bit stores because the negative impact that has shown in
 4962     // practice on inlined block copy code.
 4963     // We make such stores expensive so that we will only vectorize if there
 4964     // are 6 other instructions getting vectorized.
 4965     const int AmortizationCost = 6;
 4966
 4967     return LT.first * 2 * AmortizationCost;
 4968   }
 4969
 4970   // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
 4971   if (Ty->isPtrOrPtrVectorTy())
 4972     return LT.first;
 4973
 4974   if (useNeonVector(Ty)) {
 4975     // Check truncating stores and extending loads.
 4976     if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
 4977       // v4i8 types are lowered to scalar a load/store and sshll/xtn.
 4978       if (VT == MVT::v4i8)
 4979         return 2;
 4980       // Otherwise we need to scalarize.
 4981       return cast<FixedVectorType>(Ty)->getNumElements() * 2;
 4982     }
 4983     EVT EltVT = VT.getVectorElementType();
 4984     unsigned EltSize = EltVT.getScalarSizeInBits();
 4985     if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
 4986         VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
 4987       return LT.first;
 4988     // FIXME: v3i8 lowering currently is very inefficient, due to automatic
 4989     // widening to v4i8, which produces suboptimal results.
 4990     if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
 4991       return LT.first;
 4992
 4993     // Check non-power-of-2 loads/stores for legal vector element types with
 4994     // NEON. Non-power-of-2 memory ops will get broken down to a set of
 4995     // operations on smaller power-of-2 ops, including ld1/st1.
 4996     LLVMContext &C = Ty->getContext();
 // Greedily split the element count into power-of-2 chunks, charging one
 // operation per chunk.
 4998     SmallVector<EVT> TypeWorklist;
 4999     TypeWorklist.push_back(VT);
 5000     while (!TypeWorklist.empty()) {
 5001       EVT CurrVT = TypeWorklist.pop_back_val();
 5002       unsigned CurrNumElements = CurrVT.getVectorNumElements();
 5003       if (isPowerOf2_32(CurrNumElements)) {
 5004         Cost += 1;
 5005         continue;
 5006       }
 5007
 5008       unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
 5009       TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
 5010       TypeWorklist.push_back(
 5011           EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
 5012     }
 5013     return Cost;
 5014   }
 5015
 5016   return LT.first;
 5017}
5018
// AArch64 interleaved load/store (ldN/stN) cost model (presumably
// AArch64TTIImpl::getInterleavedMemoryOpCost — the qualified-name line is
// missing from this doxygen capture). The dropped lines after each guard
// (original 5027, 5034, 5039) presumably returned
// `InstructionCost::getInvalid()`; confirm against the original source.
 5020     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
 5021     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
 5022     bool UseMaskForCond, bool UseMaskForGaps) const {
 5023   assert(Factor >= 2 && "Invalid interleave factor");
 5024   auto *VecVTy = cast<VectorType>(VecTy);
 5025
 5026   if (VecTy->isScalableTy() && !ST->hasSVE())
 5028
 5029   // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
 5030   // only have lowering for power-of-2 factors.
 5031   // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
 5032   // InterleavedAccessPass for ld3/st3
 5033   if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
 5035
 5036   // Vectorization for masked interleaved accesses is only enabled for scalable
 5037   // VF.
 5038   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
 5040
 5041   if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
 5042     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
 5043     auto *SubVecTy =
 5044         VectorType::get(VecVTy->getElementType(),
 5045                         VecVTy->getElementCount().divideCoefficientBy(Factor));
 5046
 5047     // ldN/stN only support legal vector types of size 64 or 128 in bits.
 5048     // Accesses having vector types that are a multiple of 128 bits can be
 5049     // matched to more than one ldN/stN instruction.
 5050     bool UseScalable;
 5051     if (MinElts % Factor == 0 &&
 5052         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
 5053       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
 5054   }
 5055
 5056   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
 5057                                            Alignment, AddressSpace, CostKind,
 5058                                            UseMaskForCond, UseMaskForGaps);
 5059}
5060
// NOTE(review): the signature and opening lines (original 5061-5064) were
// dropped from this doxygen capture — given the pattern (store + reload cost
// for each 128-bit vector type in `Tys`), this is presumably
// getCostOfKeepingLiveOverCall; confirm against the original source. `Cost`
// is declared on a missing line.
 5065   for (auto *I : Tys) {
 5066     if (!I->isVectorTy())
 5067       continue;
 // Only full 128-bit vectors are assumed to be spilled/refilled around calls.
 5068     if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
 5069         128)
 5070       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
 5071               getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
 5072   }
 5073   return Cost;
 5074}
5075
// NOTE(review): the signature line (original 5076) was dropped from this
// doxygen capture — the body forwards the subtarget's maximum interleave
// factor (presumably getMaxInterleaveFactor); confirm against the original
// source.
 5077   return ST->getMaxInterleaveFactor();
 5078}
5079
 5080// For Falkor, we want to avoid having too many strided loads in a loop since
 5081// that can exhaust the HW prefetcher resources. We adjust the unroller
 5082// MaxCount preference below to attempt to ensure unrolling doesn't create too
 5083// many strided loads.
// NOTE(review): the name/parameter lines (original 5085-5086) were dropped
// from this doxygen capture — presumably getFalkorUnrollingPreferences(Loop
// *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP);
// confirm against the original source.
 5084static void
 // Budget: at most 7 strided loads after unrolling.
 5087   enum { MaxStridedLoads = 7 };
 5088   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
 5089     int StridedLoads = 0;
 5090     // FIXME? We could make this more precise by looking at the CFG and
 5091     // e.g. not counting loads in each side of an if-then-else diamond.
 5092     for (const auto BB : L->blocks()) {
 5093       for (auto &I : *BB) {
 5094         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
 5095         if (!LMemI)
 5096           continue;
 5097
 5098         Value *PtrValue = LMemI->getPointerOperand();
 5099         if (L->isLoopInvariant(PtrValue))
 5100           continue;
 5101
 // A strided load is one whose address is an affine AddRec in this loop.
 5102         const SCEV *LSCEV = SE.getSCEV(PtrValue);
 5103         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
 5104         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
 5105           continue;
 5106
 5107         // FIXME? We could take pairing of unrolled load copies into account
 5108         // by looking at the AddRec, but we would probably have to limit this
 5109         // to loops with no stores or other memory optimization barriers.
 5110         ++StridedLoads;
 5111         // We've seen enough strided loads that seeing more won't make a
 5112         // difference.
 5113         if (StridedLoads > MaxStridedLoads / 2)
 5114           return StridedLoads;
 5115       }
 5116     }
 5117     return StridedLoads;
 5118   };
 5119
 5120   int StridedLoads = countStridedLoads(L, SE);
 5121   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
 5122                     << " strided loads\n");
 5123   // Pick the largest power of 2 unroll count that won't result in too many
 5124   // strided loads.
 5125   if (StridedLoads) {
 5126     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
 5127     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
 5128                       << UP.MaxCount << '\n');
 5129   }
 5130}
5131
5132// This function returns true if the loop:
5133// 1. Has a valid cost, and
5134// 2. Has a cost within the supplied budget.
5135// Otherwise it returns false.
// NOTE(review): the first signature line (rendered 5136, carrying the function
// name — isLoopSizeWithinBudget — and the leading Loop*/TTI parameters) was
// dropped by the extraction; confirm against upstream.
// On success, *FinalSize (if non-null) receives the total code-size cost.
5137 InstructionCost Budget,
5138 unsigned *FinalSize) {
5139 // Estimate the size of the loop.
5140 InstructionCost LoopCost = 0;
5141
5142 for (auto *BB : L->getBlocks()) {
5143 for (auto &I : *BB) {
5144 SmallVector<const Value *, 4> Operands(I.operand_values());
5145 InstructionCost Cost =
5146 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5147 // This can happen with intrinsics that don't currently have a cost model
5148 // or for some operations that require SVE.
5149 if (!Cost.isValid())
5150 return false;
5151
5152 LoopCost += Cost;
// Early exit: once over budget, the exact total no longer matters.
5153 if (LoopCost > Budget)
5154 return false;
5155 }
5156 }
5157
5158 if (FinalSize)
5159 *FinalSize = LoopCost.getValue();
5160 return true;
5161}
5162
// Heuristic deciding whether a small multi-exit (search-style) loop should be
// runtime-unrolled.
// NOTE(review): the first signature line (rendered 5163, carrying the function
// name — presumably shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
// ...) — was dropped by the extraction; likewise the condition on rendered
// line 5169 guarding the first "return false" (presumably an
// isa<SCEVCouldNotCompute>(BTC) check). Confirm against upstream.
5164 const AArch64TTIImpl &TTI) {
5165 // Only consider loops with unknown trip counts for which we can determine
5166 // a symbolic expression. Multi-exit loops with small known trip counts will
5167 // likely be unrolled anyway.
5168 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5170 return false;
5171
5172 // It might not be worth unrolling loops with low max trip counts. Restrict
5173 // this to max trip counts > 32 for now.
5174 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5175 if (MaxTC > 0 && MaxTC <= 32)
5176 return false;
5177
5178 // Make sure the loop size is <= 5.
5179 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5180 return false;
5181
5182 // Small search loops with multiple exits can be highly beneficial to unroll.
5183 // We only care about loops with exactly two exiting blocks, although each
5184 // block could jump to the same exit block.
5185 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5186 if (Blocks.size() != 2)
5187 return false;
5188
// Both blocks must end in plain branches (no switches/invokes etc.).
5189 if (any_of(Blocks, [](BasicBlock *BB) {
5190 return !isa<BranchInst>(BB->getTerminator());
5191 }))
5192 return false;
5193
5194 return true;
5195}
5196
5197/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
5198/// OOO engine's wide instruction window and various predictors.
// NOTE(review): the function name/parameter lines (rendered 5200-5201) were
// dropped by the extraction; from the call site below this is the static
// helper getAppleRuntimeUnrollPreferences(Loop *, ScalarEvolution &,
// TTI::UnrollingPreferences &, const AArch64TTIImpl &). Several other
// hyperlinked tokens inside the body were also dropped (rendered lines 5233,
// 5241, 5249, 5287, 5290, 5320) — mostly conditions and declarations; confirm
// each against upstream before editing.
5199static void
5202 const AArch64TTIImpl &TTI) {
5203 // Limit loops with structure that is highly likely to benefit from runtime
5204 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5205 // likely with complex control flow). Note that the heuristics here may be
5206 // overly conservative and we err on the side of avoiding runtime unrolling
5207 // rather than unroll excessively. They are all subject to further refinement.
5208 if (!L->isInnermost() || L->getNumBlocks() > 8)
5209 return;
5210
5211 // Loops with multiple exits are handled by common code.
5212 if (!L->getExitBlock())
5213 return;
5214
5215 // Check if the loop contains any reductions that could be parallelized when
5216 // unrolling. If so, enable partial unrolling, if the trip count is know to be
5217 // a multiple of 2.
5218 bool HasParellelizableReductions =
5219 L->getNumBlocks() == 1 &&
5220 any_of(L->getHeader()->phis(),
5221 [&SE, L](PHINode &Phi) {
5222 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5223 }) &&
5224 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5225 if (HasParellelizableReductions &&
5226 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5227 UP.Partial = true;
5228 UP.MaxCount = 4;
5229 UP.AddAdditionalAccumulators = true;
5230 }
5231
// Bail out for loops whose (symbolic) trip count is unknown or known small.
// NOTE(review): the first half of this condition (rendered line 5233) was
// dropped — presumably an isa<SCEVCouldNotCompute>(BTC) check.
5232 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5234 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5235 SE.getSmallConstantMaxTripCount(L) <= 32))
5236 return;
5237
// Do not re-unroll already-vectorized loops.
5238 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5239 return;
5240
5242 return;
5243
5244 // Limit to loops with trip counts that are cheap to expand.
5245 UP.SCEVExpansionBudget = 1;
5246
5247 if (HasParellelizableReductions) {
5248 UP.Runtime = true;
5250 UP.AddAdditionalAccumulators = true;
5251 }
5252
5253 // Try to unroll small loops, of few-blocks with low budget, if they have
5254 // load/store dependencies, to expose more parallel memory access streams,
5255 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5256 BasicBlock *Header = L->getHeader();
5257 BasicBlock *Latch = L->getLoopLatch();
5258 if (Header == Latch) {
5259 // Estimate the size of the loop.
5260 unsigned Size;
5261 unsigned Width = 10;
5262 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5263 return;
5264
5265 // Try to find an unroll count that maximizes the use of the instruction
5266 // window, i.e. trying to fetch as many instructions per cycle as possible.
5267 unsigned MaxInstsPerLine = 16;
5268 unsigned UC = 1;
5269 unsigned BestUC = 1;
5270 unsigned SizeWithBestUC = BestUC * Size;
5271 while (UC <= 8) {
5272 unsigned SizeWithUC = UC * Size;
5273 if (SizeWithUC > 48)
5274 break;
5275 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5276 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5277 BestUC = UC;
5278 SizeWithBestUC = BestUC * Size;
5279 }
5280 UC++;
5281 }
5282
5283 if (BestUC == 1)
5284 return;
5285
// Collect loop-varying loaded values (plus their first in-loop users) and
// all stores; unrolling is only worthwhile if some store consumes one.
5286 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5288 for (auto *BB : L->blocks()) {
5289 for (auto &I : *BB) {
5291 if (!Ptr)
5292 continue;
5293 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5294 if (SE.isLoopInvariant(PtrSCEV, L))
5295 continue;
5296 if (isa<LoadInst>(&I)) {
5297 LoadedValuesPlus.insert(&I);
5298 // Include in-loop 1st users of loaded values.
5299 for (auto *U : I.users())
5300 if (L->contains(cast<Instruction>(U)))
5301 LoadedValuesPlus.insert(U);
5302 } else
5303 Stores.push_back(cast<StoreInst>(&I));
5304 }
5305 }
5306
5307 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5308 return LoadedValuesPlus.contains(SI->getOperand(0));
5309 }))
5310 return;
5311
5312 UP.Runtime = true;
5313 UP.DefaultUnrollRuntimeCount = BestUC;
5314 return;
5315 }
5316
5317 // Try to runtime-unroll loops with early-continues depending on loop-varying
5318 // loads; this helps with branch-prediction for the early-continues.
5319 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5321 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5322 !llvm::is_contained(Preds, Header) ||
5323 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5324 return;
5325
// Bounded recursive walk: does I (transitively) depend on an in-loop load?
5326 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5327 [&](Instruction *I, unsigned Depth) -> bool {
5328 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5329 return false;
5330
5331 if (isa<LoadInst>(I))
5332 return true;
5333
5334 return any_of(I->operands(), [&](Value *V) {
5335 auto *I = dyn_cast<Instruction>(V);
5336 return I && DependsOnLoopLoad(I, Depth + 1);
5337 });
5338 };
5339 CmpPredicate Pred;
5340 Instruction *I;
5341 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5342 m_Value())) &&
5343 DependsOnLoopLoad(I, 0)) {
5344 UP.Runtime = true;
5345 }
5346}
5347
// AArch64TTIImpl::getUnrollingPreferences: seeds the generic preferences,
// then layers on AArch64-specific heuristics (Apple/Falkor tuning, multi-exit
// search loops, in-order cores).
// NOTE(review): the extraction dropped several hyperlinked lines inside this
// function (rendered 5348, 5363, 5369, 5378-5379, 5387, 5395-5396, 5404,
// 5420, 5423, 5428) — these carried parts of the signature, the -Os check,
// a cost-accumulator declaration, the called-function lookup, the Falkor
// dispatch call, and several UP field assignments. Confirm each against
// upstream before editing.
5350 OptimizationRemarkEmitter *ORE) const {
5351 // Enable partial unrolling and runtime unrolling.
5352 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5353
5354 UP.UpperBound = true;
5355
5356 // For inner loop, it is more likely to be a hot one, and the runtime check
5357 // can be promoted out from LICM pass, so the overhead is less, let's try
5358 // a larger threshold to unroll more loops.
5359 if (L->getLoopDepth() > 1)
5360 UP.PartialThreshold *= 2;
5361
5362 // Disable partial & runtime unrolling on -Os.
5365 // Scan the loop: don't unroll loops with calls as this could prevent
5366 // inlining. Don't unroll auto-vectorized loops either, though do allow
5367 // unrolling of the scalar remainder.
5368 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5370 for (auto *BB : L->getBlocks()) {
5371 for (auto &I : *BB) {
5372 // Both auto-vectorized loops and the scalar remainder have the
5373 // isvectorized attribute, so differentiate between them by the presence
5374 // of vector instructions.
5375 if (IsVectorized && I.getType()->isVectorTy())
5376 return;
5377 if (isa<CallBase>(I)) {
5380 if (!isLoweredToCall(F))
5381 continue;
5382 return;
5383 }
5384
5385 SmallVector<const Value *, 4> Operands(I.operand_values());
5386 Cost += getInstructionCost(&I, Operands,
5388 }
5389 }
5390
5391 // Apply subtarget-specific unrolling preferences.
5392 if (ST->isAppleMLike())
5393 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5394 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5397
5398 // If this is a small, multi-exit loop similar to something like std::find,
5399 // then there is typically a performance improvement achieved by unrolling.
5400 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5401 UP.RuntimeUnrollMultiExit = true;
5402 UP.Runtime = true;
5403 // Limit unroll count.
5405 // Allow slightly more costly trip-count expansion to catch search loops
5406 // with pointer inductions.
5407 UP.SCEVExpansionBudget = 5;
5408 return;
5409 }
5410
5411 // Enable runtime unrolling for in-order models
5412 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5413 // checking for that case, we can ensure that the default behaviour is
5414 // unchanged
5415 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5416 !ST->getSchedModel().isOutOfOrder()) {
5417 UP.Runtime = true;
5418 UP.Partial = true;
5419 UP.UnrollRemainder = true;
5421
5422 UP.UnrollAndJam = true;
5424 }
5425
5426 // Force unrolling small loops can be very useful because of the branch
5427 // taken cost of the backedge.
5429 UP.Force = true;
5430}
5431
5436
// Rebuilds (or reuses) a value of ExpectedType from a NEON structured
// load/store intrinsic, for use by e.g. EarlyCSE's memory-intrinsic handling.
// NOTE(review): the first signature line (rendered 5437, carrying the function
// name — presumably AArch64TTIImpl::getOrCreateResultFromMemIntrinsic — and
// the IntrinsicInst *Inst parameter) was dropped by the extraction.
5438 Type *ExpectedType,
5439 bool CanCreate) const {
5440 switch (Inst->getIntrinsicID()) {
5441 default:
5442 return nullptr;
5443 case Intrinsic::aarch64_neon_st2:
5444 case Intrinsic::aarch64_neon_st3:
5445 case Intrinsic::aarch64_neon_st4: {
5446 // Create a struct type
// For stN the stored values (all args except the trailing pointer) are
// packed into a struct matching ExpectedType, element by element.
5447 StructType *ST = dyn_cast<StructType>(ExpectedType);
5448 if (!CanCreate || !ST)
5449 return nullptr;
5450 unsigned NumElts = Inst->arg_size() - 1;
5451 if (ST->getNumElements() != NumElts)
5452 return nullptr;
5453 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5454 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5455 return nullptr;
5456 }
5457 Value *Res = PoisonValue::get(ExpectedType);
5458 IRBuilder<> Builder(Inst);
5459 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5460 Value *L = Inst->getArgOperand(i);
5461 Res = Builder.CreateInsertValue(Res, L, i);
5462 }
5463 return Res;
5464 }
5465 case Intrinsic::aarch64_neon_ld2:
5466 case Intrinsic::aarch64_neon_ld3:
5467 case Intrinsic::aarch64_neon_ld4:
// For ldN the intrinsic's own result is usable directly if types match.
5468 if (Inst->getType() == ExpectedType)
5469 return Inst;
5470 return nullptr;
5471 }
5472}
5473
// Fills MemIntrinsicInfo for the NEON structured load/store intrinsics:
// read/write direction, the pointer operand, and a MatchingId pairing each
// ldN with its stN counterpart. Returns true only for ld2/3/4 and st2/3/4.
// NOTE(review): the first signature line (rendered 5474, with the function
// name — presumably AArch64TTIImpl::getTgtMemIntrinsic — and the
// IntrinsicInst *Inst parameter) was dropped by the extraction.
5475 MemIntrinsicInfo &Info) const {
5476 switch (Inst->getIntrinsicID()) {
5477 default:
5478 break;
5479 case Intrinsic::aarch64_neon_ld2:
5480 case Intrinsic::aarch64_neon_ld3:
5481 case Intrinsic::aarch64_neon_ld4:
5482 Info.ReadMem = true;
5483 Info.WriteMem = false;
// ldN takes the pointer as its only argument.
5484 Info.PtrVal = Inst->getArgOperand(0);
5485 break;
5486 case Intrinsic::aarch64_neon_st2:
5487 case Intrinsic::aarch64_neon_st3:
5488 case Intrinsic::aarch64_neon_st4:
5489 Info.ReadMem = false;
5490 Info.WriteMem = true;
// stN takes the pointer as its last argument, after the stored values.
5491 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5492 break;
5493 }
5494
// Second switch assigns a MatchingId so that a ldN can be CSE'd against the
// corresponding stN of the same element count.
5495 switch (Inst->getIntrinsicID()) {
5496 default:
5497 return false;
5498 case Intrinsic::aarch64_neon_ld2:
5499 case Intrinsic::aarch64_neon_st2:
5500 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5501 break;
5502 case Intrinsic::aarch64_neon_ld3:
5503 case Intrinsic::aarch64_neon_st3:
5504 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5505 break;
5506 case Intrinsic::aarch64_neon_ld4:
5507 case Intrinsic::aarch64_neon_st4:
5508 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5509 break;
5510 }
5511 return true;
5512}
5513
5514/// See if \p I should be considered for address type promotion. We check if \p
5515/// I is a sext with right type and used in memory accesses. If it used in a
5516/// "complex" getelementptr, we allow it to be promoted without finding other
5517/// sext instructions that sign extended the same initial value. A getelementptr
5518/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the signature line (rendered 5519, with the function name —
// presumably AArch64TTIImpl::shouldConsiderAddressTypePromotion) was dropped
// by the extraction.
5520 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5521 bool Considerable = false;
5522 AllowPromotionWithoutCommonHeader = false;
// Only sext-to-i64 instructions are candidates.
5523 if (!isa<SExtInst>(&I))
5524 return false;
5525 Type *ConsideredSExtType =
5526 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5527 if (I.getType() != ConsideredSExtType)
5528 return false;
5529 // See if the sext is the one with the right type and used in at least one
5530 // GetElementPtrInst.
5531 for (const User *U : I.users()) {
5532 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5533 Considerable = true;
5534 // A getelementptr is considered as "complex" if it has more than 2
5535 // operands. We will promote a SExt used in such complex GEP as we
5536 // expect some computation to be merged if they are done on 64 bits.
5537 if (GEPInst->getNumOperands() > 2) {
5538 AllowPromotionWithoutCommonHeader = true;
5539 break;
5540 }
5541 }
5542 }
5543 return Considerable;
5544}
5545
// Whether a reduction described by RdxDesc can be vectorized at VF. Fixed
// VFs are always legal; scalable VFs require a legal SVE element type and a
// supported recurrence kind.
// NOTE(review): the signature line (rendered 5546, with the function name —
// presumably AArch64TTIImpl::isLegalToVectorizeReduction) was dropped by the
// extraction, as were the bodies of the Sub (5557) and AnyOf (5571) case
// labels — confirm their fall-through/return behavior against upstream.
5547 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5548 if (!VF.isScalable())
5549 return true;
5550
5551 Type *Ty = RdxDesc.getRecurrenceType();
// bf16 reductions and element types illegal for scalable vectors bail out.
5552 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5553 return false;
5554
5555 switch (RdxDesc.getRecurrenceKind()) {
5556 case RecurKind::Sub:
5558 case RecurKind::Add:
5559 case RecurKind::FAdd:
5560 case RecurKind::And:
5561 case RecurKind::Or:
5562 case RecurKind::Xor:
5563 case RecurKind::SMin:
5564 case RecurKind::SMax:
5565 case RecurKind::UMin:
5566 case RecurKind::UMax:
5567 case RecurKind::FMin:
5568 case RecurKind::FMax:
5569 case RecurKind::FMulAdd:
5570 case RecurKind::AnyOf:
5572 return true;
5573 default:
5574 return false;
5575 }
5576}
5577
// Cost of a min/max reduction: legalization cost per extra split plus a flat
// cost of 2 for the final horizontal reduction.
// NOTE(review): the extraction dropped the leading signature lines (rendered
// 5578-5579, function name presumably AArch64TTIImpl::getMinMaxReductionCost)
// and the invalid-cost return on rendered line 5588 for the
// <vscale x 1 x eltty> case. Confirm against upstream.
5580 FastMathFlags FMF,
5582 // The code-generator is currently not able to handle scalable vectors
5583 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5584 // it. This change will be removed when code-generation for these types is
5585 // sufficiently reliable.
5586 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5587 if (VTy->getElementCount() == ElementCount::getScalable(1))
5590 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5591
// f16 without full fp16 support is not costed here; defer to the base.
5592 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5593 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5594
5595 InstructionCost LegalizationCost = 0;
5596 if (LT.first > 1) {
5597 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5598 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5599 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5600 }
5601
5602 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5603}
5604
// SVE arithmetic reduction cost: legalization splits cost (LT.first - 1)
// arithmetic ops, plus a flat 2 for the horizontal reduction of supported
// opcodes.
// NOTE(review): the leading signature line (rendered 5605, function name
// presumably AArch64TTIImpl::getArithmeticReductionCostSVE) and the default-
// case return on rendered line 5626 (presumably an invalid-cost return) were
// dropped by the extraction.
5606 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5607 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5608 InstructionCost LegalizationCost = 0;
5609 if (LT.first > 1) {
5610 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5611 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5612 LegalizationCost *= LT.first - 1;
5613 }
5614
5615 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5616 assert(ISD && "Invalid opcode");
5617 // Add the final reduction cost for the legal horizontal reduction
5618 switch (ISD) {
5619 case ISD::ADD:
5620 case ISD::AND:
5621 case ISD::OR:
5622 case ISD::XOR:
5623 case ISD::FADD:
5624 return LegalizationCost + 2;
5625 default:
5627 }
5628}
5629
// Arithmetic reduction cost. Handling order: invalid <vscale x 1 x T>, a
// fixed-vector surcharge path (guarded by a condition dropped at rendered
// line 5642), the FAdd-over-scalable path, the SVE path, then a NEON cost
// table for ADD/OR/XOR/AND plus an faddp-based FADD model.
// NOTE(review): the extraction dropped the leading signature lines (rendered
// 5630-5631, function name presumably
// AArch64TTIImpl::getArithmeticReductionCost), the invalid-cost return at
// 5640, the guard condition at 5642, the invalid return at 5652, and the Cost
// declaration at 5655. Confirm each against upstream before editing.
5632 std::optional<FastMathFlags> FMF,
5634 // The code-generator is currently not able to handle scalable vectors
5635 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5636 // it. This change will be removed when code-generation for these types is
5637 // sufficiently reliable.
5638 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5639 if (VTy->getElementCount() == ElementCount::getScalable(1))
5643 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5644 InstructionCost BaseCost =
5645 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5646 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5647 // end up vectorizing for more computationally intensive loops.
5648 return BaseCost + FixedVTy->getNumElements();
5649 }
5650
5651 if (Opcode != Instruction::FAdd)
5653
5654 auto *VTy = cast<ScalableVectorType>(ValTy);
5656 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5657 Cost *= getMaxNumElements(VTy->getElementCount());
5658 return Cost;
5659 }
5660
5661 if (isa<ScalableVectorType>(ValTy))
5662 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5663
5664 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5665 MVT MTy = LT.second;
5666 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5667 assert(ISD && "Invalid opcode");
5668
5669 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5670 // instructions as twice a normal vector add, plus 1 for each legalization
5671 // step (LT.first). This is the only arithmetic vector reduction operation for
5672 // which we have an instruction.
5673 // OR, XOR and AND costs should match the codegen from:
5674 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5675 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5676 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5677 static const CostTblEntry CostTblNoPairwise[]{
5678 {ISD::ADD, MVT::v8i8, 2},
5679 {ISD::ADD, MVT::v16i8, 2},
5680 {ISD::ADD, MVT::v4i16, 2},
5681 {ISD::ADD, MVT::v8i16, 2},
5682 {ISD::ADD, MVT::v2i32, 2},
5683 {ISD::ADD, MVT::v4i32, 2},
5684 {ISD::ADD, MVT::v2i64, 2},
5685 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5686 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5687 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5688 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5689 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5690 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5691 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5692 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5693 {ISD::XOR, MVT::v16i8, 7},
5694 {ISD::XOR, MVT::v4i16, 4},
5695 {ISD::XOR, MVT::v8i16, 6},
5696 {ISD::XOR, MVT::v2i32, 3},
5697 {ISD::XOR, MVT::v4i32, 5},
5698 {ISD::XOR, MVT::v2i64, 3},
5699 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5700 {ISD::AND, MVT::v16i8, 7},
5701 {ISD::AND, MVT::v4i16, 4},
5702 {ISD::AND, MVT::v8i16, 6},
5703 {ISD::AND, MVT::v2i32, 3},
5704 {ISD::AND, MVT::v4i32, 5},
5705 {ISD::AND, MVT::v2i64, 3},
5706 };
5707 switch (ISD) {
5708 default:
5709 break;
5710 case ISD::FADD:
5711 if (Type *EltTy = ValTy->getScalarType();
5712 // FIXME: For half types without fullfp16 support, this could extend and
5713 // use a fp32 faddp reduction but current codegen unrolls.
5714 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5715 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5716 const unsigned NElts = MTy.getVectorNumElements();
5717 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5718 isPowerOf2_32(NElts))
5719 // Reduction corresponding to series of fadd instructions is lowered to
5720 // series of faddp instructions. faddp has latency/throughput that
5721 // matches fadd instruction and hence, every faddp instruction can be
5722 // considered to have a relative cost = 1 with
5723 // CostKind = TCK_RecipThroughput.
5724 // An faddp will pairwise add vector elements, so the size of input
5725 // vector reduces by half every time, requiring
5726 // #(faddp instructions) = log2_32(NElts).
5727 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5728 }
5729 break;
5730 case ISD::ADD:
5731 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5732 return (LT.first - 1) + Entry->Cost;
5733 break;
5734 case ISD::XOR:
5735 case ISD::AND:
5736 case ISD::OR:
5737 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5738 if (!Entry)
5739 break;
5740 auto *ValVTy = cast<FixedVectorType>(ValTy);
5741 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5742 isPowerOf2_32(ValVTy->getNumElements())) {
5743 InstructionCost ExtraCost = 0;
5744 if (LT.first != 1) {
5745 // Type needs to be split, so there is an extra cost of LT.first - 1
5746 // arithmetic ops.
5747 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5748 MTy.getVectorNumElements());
5749 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5750 ExtraCost *= LT.first - 1;
5751 }
5752 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5753 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5754 return Cost + ExtraCost;
5755 }
5756 break;
5757 }
5758 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
5760
// Cost of an extended (widening) add reduction, matching UADDLV (8/16/32->32)
// and UADDLP (32->64) patterns; other cases defer to the base implementation.
// NOTE(review): the leading signature line (rendered 5761, function name
// presumably AArch64TTIImpl::getExtendedReductionCost) was dropped by the
// extraction.
5762 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5763 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5764 EVT VecVT = TLI->getValueType(DL, VecTy);
5765 EVT ResVT = TLI->getValueType(DL, ResTy);
5766
5767 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5768 VecVT.getSizeInBits() >= 64) {
5769 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5770
5771 // The legal cases are:
5772 // UADDLV 8/16/32->32
5773 // UADDLP 32->64
5774 unsigned RevVTSize = ResVT.getSizeInBits();
5775 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5776 RevVTSize <= 32) ||
5777 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5778 RevVTSize <= 32) ||
5779 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5780 RevVTSize <= 64))
// 2 per extra legalization step plus 2 for the reduction itself.
5781 return (LT.first - 1) * 2 + 2;
5782 }
5783
5784 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5785 CostKind);
5786}
5787
// Cost of a multiply-accumulate reduction: with the DotProd feature, an
// i8 -> i32 add-reduction of products maps to UDOT/SDOT plus a final uaddv.
5789AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5790 Type *ResTy, VectorType *VecTy,
5792 EVT VecVT = TLI->getValueType(DL, VecTy);
5793 EVT ResVT = TLI->getValueType(DL, ResTy);
5794
5795 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5796 RedOpcode == Instruction::Add) {
5797 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5798
5799 // The legal cases with dotprod are
5800 // UDOT 8->32
5801 // Which requires an additional uaddv to sum the i32 values.
5802 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5803 ResVT == MVT::i32)
5804 return LT.first + 2;
5805 }
5806
5807 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5808 CostKind);
5809}
5810
// Cost of an SVE SK_Splice shuffle. i1 predicate vectors are promoted before
// lowering, so the cost is computed on the promoted type, plus compare/select
// legalization when Index < 0 and zext/trunc round-trips for predicates.
// NOTE(review): the extraction dropped the leading signature lines (rendered
// 5811-5813, function name presumably AArch64TTIImpl::getSpliceCost with
// VectorType *Tp and int Index parameters) and the invalid-cost return for
// <vscale x 1 x eltty> at rendered lines 5834-5835. Confirm against upstream.
5814 static const CostTblEntry ShuffleTbl[] = {
5815 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5816 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5817 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5818 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5819 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5820 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5821 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5822 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5823 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5824 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5825 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5826 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5827 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5828 };
5829
5830 // The code-generator is currently not able to handle scalable vectors
5831 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5832 // it. This change will be removed when code-generation for these types is
5833 // sufficiently reliable.
5837 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5838 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5839 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5840 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5841 : LT.second;
5842 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5843 InstructionCost LegalizationCost = 0;
// A negative Index means a trailing-element splice, which lowers via an
// icmp+select sequence; charge its legalization here.
5844 if (Index < 0) {
5845 LegalizationCost =
5846 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5848 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5850 }
5851
5852 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5853 // Cost performed on a promoted type.
5854 if (LT.second.getScalarType() == MVT::i1) {
5855 LegalizationCost +=
5856 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5858 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5860 }
5861 const auto *Entry =
5862 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5863 assert(Entry && "Illegal Type for Splice");
5864 LegalizationCost += Entry->Cost;
5865 return LegalizationCost * LT.first;
5866}
5867
// Cost of a partial reduction (e.g. dot-product style accumulate). Returns an
// invalid cost for unsupported opcode/extend/type combinations, and otherwise
// a cost based on the legalized input type, with adjustments for sub/negation,
// half-full inputs, and natively supported udot/sdot/usdot/fdot patterns.
// NOTE(review): the extraction dropped the leading signature lines (rendered
// 5868/5870, function name presumably AArch64TTIImpl::getPartialReductionCost
// with ElementCount VF and TTI::PartialReductionExtendKind OpAExtend among the
// parameters), an early guard at 5875, the legal/split case labels of the
// getTypeConversion switch at 5926-5928, and parts of conditions at 5931 and
// 5950. Confirm each against upstream before editing.
5869 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5871 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5872 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
5876 return Invalid;
5877
// Fixed VF requires either SVE/streaming-SVE, or NEON with the DotProd
// feature.
5878 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5879 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5880 return Invalid;
5881
5882 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5883 Opcode != Instruction::FAdd) ||
5884 OpAExtend == TTI::PR_None)
5885 return Invalid;
5886
5887 // Floating-point partial reductions are invalid if `reassoc` and `contract`
5888 // are not allowed.
5889 if (AccumType->isFloatingPointTy()) {
5890 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
5891 if (!FMF->allowReassoc() || !FMF->allowContract())
5892 return Invalid;
5893 } else {
5894 assert(!FMF &&
5895 "FastMathFlags only apply to floating-point partial reductions");
5896 }
5897
5898 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5899 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5900 "Unexpected values for OpBExtend or InputTypeB");
5901
5902 // We only support multiply binary operations for now, and for muls we
5903 // require the types being extended to be the same.
5904 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
5905 InputTypeA != InputTypeB))
5906 return Invalid;
5907
// Mixed-sign extends (usdot) need the I8MM (matmul-int8) feature.
5908 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5909 if (IsUSDot && !ST->hasMatMulInt8())
5910 return Invalid;
5911
5912 unsigned Ratio =
5913 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5914 if (VF.getKnownMinValue() <= Ratio)
5915 return Invalid;
5916
5917 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5918 VectorType *AccumVectorType =
5919 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5920 // We don't yet support all kinds of legalization.
5921 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5922 EVT::getEVT(AccumVectorType));
5923 switch (TC.first) {
5924 default:
5925 return Invalid;
5929 // The legalised type (e.g. after splitting) must be legal too.
5930 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5932 return Invalid;
5933 break;
5934 }
5935
5936 std::pair<InstructionCost, MVT> AccumLT =
5937 getTypeLegalizationCost(AccumVectorType);
5938 std::pair<InstructionCost, MVT> InputLT =
5939 getTypeLegalizationCost(InputVectorType);
5940
5941 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5942
5943 // The sub/negation cannot be folded into the operands of
5944 // ISD::PARTIAL_REDUCE_*MLA, so make the cost more expensive.
5945 if (Opcode == Instruction::Sub)
5946 Cost += 8;
5947
5948 // Prefer using full types by costing half-full input types as more expensive.
5949 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5951 // FIXME: This can be removed after the cost of the extends are folded into
5952 // the dot-product expression in VPlan, after landing:
5953 // https://github.com/llvm/llvm-project/pull/147302
5954 Cost *= 2;
5955
5956 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5957 // i16 -> i64 is natively supported for udot/sdot
5958 if (AccumLT.second.getScalarType() == MVT::i64 &&
5959 InputLT.second.getScalarType() == MVT::i16)
5960 return Cost;
5961 // i16 -> i32 is natively supported with SVE2p1
5962 if (AccumLT.second.getScalarType() == MVT::i32 &&
5963 InputLT.second.getScalarType() == MVT::i16 &&
5964 (ST->hasSVE2p1() || ST->hasSME2()))
5965 return Cost;
5966 // i8 -> i64 is supported with an extra level of extends
5967 if (AccumLT.second.getScalarType() == MVT::i64 &&
5968 InputLT.second.getScalarType() == MVT::i8)
5969 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5970 // because it requires two extra extends on the inputs. But if we'd change
5971 // that now, a regular reduction would be cheaper because the costs of
5972 // the extends in the IR are still counted. This can be fixed
5973 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5974 return Cost;
5975 }
5976
5977 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5978 if (ST->isSVEorStreamingSVEAvailable() ||
5979 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5980 ST->hasDotProd())) {
5981 if (AccumLT.second.getScalarType() == MVT::i32 &&
5982 InputLT.second.getScalarType() == MVT::i8)
5983 return Cost;
5984 }
5985
5986 // f16 -> f32 is natively supported for fdot
5987 if (Opcode == Instruction::FAdd && (ST->hasSME2() || ST->hasSVE2p1())) {
5988 if (AccumLT.second.getScalarType() == MVT::f32 &&
5989 InputLT.second.getScalarType() == MVT::f16 &&
5990 AccumLT.second.getVectorMinNumElements() == 4 &&
5991 InputLT.second.getVectorMinNumElements() == 8)
5992 return Cost;
5993 // Floating-point types aren't promoted, so expanding the partial reduction
5994 // is more expensive.
5995 return Cost + 20;
5996 }
5997
5998 // Add additional cost for the extends that would need to be inserted.
5999 return Cost + 2;
6000}
6001
6004 VectorType *SrcTy, ArrayRef<int> Mask,
6005 TTI::TargetCostKind CostKind, int Index,
6007 const Instruction *CxtI) const {
6008 assert((Mask.empty() || DstTy->isScalableTy() ||
6009 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6010 "Expected the Mask to match the return size if given");
6011 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6012 "Expected the same scalar types");
6013 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6014
6015 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6016 // into smaller vectors and sum the cost of each shuffle.
6017 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6018 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6019 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6020 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6021 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6022 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6023 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6024 // cost than just the load.
6025 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6028 return std::max<InstructionCost>(1, LT.first / 4);
6029
6030 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6031 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6032 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6033 // cost than just the store.
6034 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6036 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6038 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6039 return LT.first;
6040
6041 unsigned TpNumElts = Mask.size();
6042 unsigned LTNumElts = LT.second.getVectorNumElements();
6043 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6044 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6045 LT.second.getVectorElementCount());
6047 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6048 PreviousCosts;
6049 for (unsigned N = 0; N < NumVecs; N++) {
6050 SmallVector<int> NMask;
6051 // Split the existing mask into chunks of size LTNumElts. Track the source
6052 // sub-vectors to ensure the result has at most 2 inputs.
6053 unsigned Source1 = -1U, Source2 = -1U;
6054 unsigned NumSources = 0;
6055 for (unsigned E = 0; E < LTNumElts; E++) {
6056 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6058 if (MaskElt < 0) {
6060 continue;
6061 }
6062
6063 // Calculate which source from the input this comes from and whether it
6064 // is new to us.
6065 unsigned Source = MaskElt / LTNumElts;
6066 if (NumSources == 0) {
6067 Source1 = Source;
6068 NumSources = 1;
6069 } else if (NumSources == 1 && Source != Source1) {
6070 Source2 = Source;
6071 NumSources = 2;
6072 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6073 NumSources++;
6074 }
6075
6076 // Add to the new mask. For the NumSources>2 case these are not correct,
6077 // but are only used for the modular lane number.
6078 if (Source == Source1)
6079 NMask.push_back(MaskElt % LTNumElts);
6080 else if (Source == Source2)
6081 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6082 else
6083 NMask.push_back(MaskElt % LTNumElts);
6084 }
6085 // Check if we have already generated this sub-shuffle, which means we
6086 // will have already generated the output. For example a <16 x i32> splat
6087 // will be the same sub-splat 4 times, which only needs to be generated
6088 // once and reused.
6089 auto Result =
6090 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6091 // Check if it was already in the map (already costed).
6092 if (!Result.second)
6093 continue;
6094 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6095 // getShuffleCost. If not then cost it using the worst case as the number
6096 // of element moves into a new vector.
6097 InstructionCost NCost =
6098 NumSources <= 2
6099 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6101 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6102 CxtI)
6103 : LTNumElts;
6104 Result.first->second = NCost;
6105 Cost += NCost;
6106 }
6107 return Cost;
6108 }
6109
6110 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6111 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6112 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6113 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6114 // This currently only handles low or high extracts to prevent SLP vectorizer
6115 // regressions.
6116 // Note that SVE's ext instruction is destructive, but it can be fused with
6117 // a movprfx to act like a constructive instruction.
6118 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6119 if (LT.second.getFixedSizeInBits() >= 128 &&
6120 cast<FixedVectorType>(SubTp)->getNumElements() ==
6121 LT.second.getVectorNumElements() / 2) {
6122 if (Index == 0)
6123 return 0;
6124 if (Index == (int)LT.second.getVectorNumElements() / 2)
6125 return 1;
6126 }
6128 }
6129 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6130 // the code to handle length-changing shuffles.
6131 if (Kind == TTI::SK_InsertSubvector) {
6132 LT = getTypeLegalizationCost(DstTy);
6133 SrcTy = DstTy;
6134 }
6135
6136 // Check for identity masks, which we can treat as free for both fixed and
6137 // scalable vector paths.
6138 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6139 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6140 all_of(enumerate(Mask), [](const auto &M) {
6141 return M.value() < 0 || M.value() == (int)M.index();
6142 }))
6143 return 0;
6144
6145 // Segmented shuffle matching.
6146 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6147 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6148 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6150
6152 unsigned Segments =
6154 unsigned SegmentElts = VTy->getNumElements() / Segments;
6155
6156 // dupq zd.t, zn.t[idx]
6157 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6158 ST->isSVEorStreamingSVEAvailable() &&
6159 isDUPQMask(Mask, Segments, SegmentElts))
6160 return LT.first;
6161
6162 // mov zd.q, vn
6163 if (ST->isSVEorStreamingSVEAvailable() &&
6164 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6165 return LT.first;
6166 }
6167
6168 // Check for broadcast loads, which are supported by the LD1R instruction.
6169 // In terms of code-size, the shuffle vector is free when a load + dup get
6170 // folded into a LD1R. That's what we check and return here. For performance
6171 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6172 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6173 // that we model the load + dup sequence slightly higher because LD1R is a
6174 // high latency instruction.
6175 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6176 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6177 if (IsLoad && LT.second.isVector() &&
6178 isLegalBroadcastLoad(SrcTy->getElementType(),
6179 LT.second.getVectorElementCount()))
6180 return 0;
6181 }
6182
6183 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6184 // from the perfect shuffle tables.
6185 if (Mask.size() == 4 &&
6186 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6187 (SrcTy->getScalarSizeInBits() == 16 ||
6188 SrcTy->getScalarSizeInBits() == 32) &&
6189 all_of(Mask, [](int E) { return E < 8; }))
6190 return getPerfectShuffleCost(Mask);
6191
6192 // Check for other shuffles that are not SK_ kinds but we have native
6193 // instructions for, for example ZIP and UZP.
6194 unsigned Unused;
6195 if (LT.second.isFixedLengthVector() &&
6196 LT.second.getVectorNumElements() == Mask.size() &&
6197 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6198 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6199 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6200 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6201 Kind == TTI::SK_InsertSubvector) &&
6202 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6203 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6204 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6205 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6206 LT.second.getVectorNumElements(), 16) ||
6207 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6208 LT.second.getVectorNumElements(), 32) ||
6209 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6210 LT.second.getVectorNumElements(), 64) ||
6211 // Check for non-zero lane splats
6212 all_of(drop_begin(Mask),
6213 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6214 return 1;
6215
6216 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6217 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6218 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6219 static const CostTblEntry ShuffleTbl[] = {
6220 // Broadcast shuffle kinds can be performed with 'dup'.
6221 {TTI::SK_Broadcast, MVT::v8i8, 1},
6222 {TTI::SK_Broadcast, MVT::v16i8, 1},
6223 {TTI::SK_Broadcast, MVT::v4i16, 1},
6224 {TTI::SK_Broadcast, MVT::v8i16, 1},
6225 {TTI::SK_Broadcast, MVT::v2i32, 1},
6226 {TTI::SK_Broadcast, MVT::v4i32, 1},
6227 {TTI::SK_Broadcast, MVT::v2i64, 1},
6228 {TTI::SK_Broadcast, MVT::v4f16, 1},
6229 {TTI::SK_Broadcast, MVT::v8f16, 1},
6230 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6231 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6232 {TTI::SK_Broadcast, MVT::v2f32, 1},
6233 {TTI::SK_Broadcast, MVT::v4f32, 1},
6234 {TTI::SK_Broadcast, MVT::v2f64, 1},
6235 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6236 // 'zip1/zip2' instructions.
6237 {TTI::SK_Transpose, MVT::v8i8, 1},
6238 {TTI::SK_Transpose, MVT::v16i8, 1},
6239 {TTI::SK_Transpose, MVT::v4i16, 1},
6240 {TTI::SK_Transpose, MVT::v8i16, 1},
6241 {TTI::SK_Transpose, MVT::v2i32, 1},
6242 {TTI::SK_Transpose, MVT::v4i32, 1},
6243 {TTI::SK_Transpose, MVT::v2i64, 1},
6244 {TTI::SK_Transpose, MVT::v4f16, 1},
6245 {TTI::SK_Transpose, MVT::v8f16, 1},
6246 {TTI::SK_Transpose, MVT::v4bf16, 1},
6247 {TTI::SK_Transpose, MVT::v8bf16, 1},
6248 {TTI::SK_Transpose, MVT::v2f32, 1},
6249 {TTI::SK_Transpose, MVT::v4f32, 1},
6250 {TTI::SK_Transpose, MVT::v2f64, 1},
6251 // Select shuffle kinds.
6252 // TODO: handle vXi8/vXi16.
6253 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6254 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6255 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6256 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6257 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6258 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6259 // PermuteSingleSrc shuffle kinds.
6260 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6261 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6262 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6263 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6264 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6265 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6266 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6267 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6268 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6269 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6270 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6271 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6272 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6273 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6274 // Reverse can be lowered with `rev`.
6275 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6276 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6277 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6278 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6279 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6280 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6281 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6282 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6283 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6284 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6285 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6286 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6287 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6288 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6289 // Splice can all be lowered as `ext`.
6290 {TTI::SK_Splice, MVT::v2i32, 1},
6291 {TTI::SK_Splice, MVT::v4i32, 1},
6292 {TTI::SK_Splice, MVT::v2i64, 1},
6293 {TTI::SK_Splice, MVT::v2f32, 1},
6294 {TTI::SK_Splice, MVT::v4f32, 1},
6295 {TTI::SK_Splice, MVT::v2f64, 1},
6296 {TTI::SK_Splice, MVT::v8f16, 1},
6297 {TTI::SK_Splice, MVT::v8bf16, 1},
6298 {TTI::SK_Splice, MVT::v8i16, 1},
6299 {TTI::SK_Splice, MVT::v16i8, 1},
6300 {TTI::SK_Splice, MVT::v4f16, 1},
6301 {TTI::SK_Splice, MVT::v4bf16, 1},
6302 {TTI::SK_Splice, MVT::v4i16, 1},
6303 {TTI::SK_Splice, MVT::v8i8, 1},
6304 // Broadcast shuffle kinds for scalable vectors
6305 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6306 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6307 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6308 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6309 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6310 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6311 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6312 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6313 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6314 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6315 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6316 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6317 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6318 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6319 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6320 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6321 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6322 // Handle the cases for vector.reverse with scalable vectors
6323 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6324 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6325 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6326 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6327 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6328 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6329 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6330 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6331 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6332 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6333 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6334 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6335 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6336 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6337 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6338 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6339 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6340 };
6341 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6342 return LT.first * Entry->Cost;
6343 }
6344
6345 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6346 return getSpliceCost(SrcTy, Index, CostKind);
6347
6348 // Inserting a subvector can often be done with either a D, S or H register
6349 // move, so long as the inserted vector is "aligned".
6350 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6351 LT.second.getSizeInBits() <= 128 && SubTp) {
6352 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6353 if (SubLT.second.isVector()) {
6354 int NumElts = LT.second.getVectorNumElements();
6355 int NumSubElts = SubLT.second.getVectorNumElements();
6356 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6357 return SubLT.first;
6358 }
6359 }
6360
6361 // Restore optimal kind.
6362 if (IsExtractSubvector)
6364 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6365 Args, CxtI);
6366}
6367
6370 const DominatorTree &DT) {
6371 const auto &Strides = DenseMap<Value *, const SCEV *>();
6372 for (BasicBlock *BB : TheLoop->blocks()) {
6373 // Scan the instructions in the block and look for addresses that are
6374 // consecutive and decreasing.
6375 for (Instruction &I : *BB) {
6376 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6378 Type *AccessTy = getLoadStoreType(&I);
6379 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6380 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6381 .value_or(0) < 0)
6382 return true;
6383 }
6384 }
6385 }
6386 return false;
6387}
6388
6390 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6392 // For cases like post-LTO vectorization, when we eventually know the trip
6393 // count, epilogue with fixed-width vectorization can be deleted if the trip
6394 // count is less than the epilogue iterations. That's why we prefer
6395 // fixed-width vectorization in epilogue in case of equal costs.
6396 if (IsEpilogue)
6397 return true;
6398 return ST->useFixedOverScalableIfEqualCost();
6399}
6400
6402 return ST->getEpilogueVectorizationMinVF();
6403}
6404
6406 if (!ST->hasSVE())
6407 return false;
6408
6409 // We don't currently support vectorisation with interleaving for SVE - with
6410 // such loops we're better off not using tail-folding. This gives us a chance
6411 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6412 if (TFI->IAI->hasGroups())
6413 return false;
6414
6416 if (TFI->LVL->getReductionVars().size())
6418 if (TFI->LVL->getFixedOrderRecurrences().size())
6420
6421 // We call this to discover whether any load/store pointers in the loop have
6422 // negative strides. This will require extra work to reverse the loop
6423 // predicate, which may be expensive.
6426 *TFI->LVL->getDominatorTree()))
6430
6431 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6432 Required))
6433 return false;
6434
6435 // Don't tail-fold for tight loops where we would be better off interleaving
6436 // with an unpredicated loop.
6437 unsigned NumInsns = 0;
6438 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6439 NumInsns += BB->sizeWithoutDebug();
6440 }
6441
6442 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6443 return NumInsns >= SVETailFoldInsnThreshold;
6444}
6445
6448 StackOffset BaseOffset, bool HasBaseReg,
6449 int64_t Scale, unsigned AddrSpace) const {
6450 // Scaling factors are not free at all.
6451 // Operands | Rt Latency
6452 // -------------------------------------------
6453 // Rt, [Xn, Xm] | 4
6454 // -------------------------------------------
6455 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6456 // Rt, [Xn, Wm, <extend> #imm] |
6458 AM.BaseGV = BaseGV;
6459 AM.BaseOffs = BaseOffset.getFixed();
6460 AM.HasBaseReg = HasBaseReg;
6461 AM.Scale = Scale;
6462 AM.ScalableOffset = BaseOffset.getScalable();
6463 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6464 // Scale represents reg2 * scale, thus account for 1 if
6465 // it is not equal to 0 or 1.
6466 return AM.Scale != 0 && AM.Scale != 1;
6468}
6469
6471 const Instruction *I) const {
6473 // For the binary operators (e.g. or) we need to be more careful than
6474 // selects, here we only transform them if they are already at a natural
6475 // break point in the code - the end of a block with an unconditional
6476 // terminator.
6477 if (I->getOpcode() == Instruction::Or &&
6478 isa<BranchInst>(I->getNextNode()) &&
6479 cast<BranchInst>(I->getNextNode())->isUnconditional())
6480 return true;
6481
6482 if (I->getOpcode() == Instruction::Add ||
6483 I->getOpcode() == Instruction::Sub)
6484 return true;
6485 }
6487}
6488
6491 const TargetTransformInfo::LSRCost &C2) const {
6492 // AArch64 specific here is adding the number of instructions to the
6493 // comparison (though not as the first consideration, as some targets do)
6494 // along with changing the priority of the base additions.
6495 // TODO: Maybe a more nuanced tradeoff between instruction count
6496 // and number of registers? To be investigated at a later date.
6497 if (EnableLSRCostOpt)
6498 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6499 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6500 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6501 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6502
6504}
6505
6506static bool isSplatShuffle(Value *V) {
6507 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6508 return all_equal(Shuf->getShuffleMask());
6509 return false;
6510}
6511
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
/// If \p AllowSplat is true, a splat-shuffle operand is exempted from the
/// extract checks (only the other operand must be a half extract).
static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
                                     bool AllowSplat = false) {
  // Scalable types can't be extract shuffle vectors.
  if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
    return false;

  // True when FullV's vector type is exactly twice as wide, in bits, as
  // HalfV's.
  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
    auto *FullTy = FullV->getType();
    auto *HalfTy = HalfV->getType();
    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
  };

  // True when FullV has exactly twice as many elements as HalfV.
  auto extractHalf = [](Value *FullV, Value *HalfV) {
    auto *FullVT = cast<FixedVectorType>(FullV->getType());
    auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  };

  // Both operands must be single-input shuffles (second input undef).
  ArrayRef<int> M1, M2;
  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
  if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
      !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
    return false;

  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
  // it is not checked as an extract below.
  if (AllowSplat && isSplatShuffle(Op1))
    S1Op1 = nullptr;
  if (AllowSplat && isSplatShuffle(Op2))
    S2Op1 = nullptr;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int M1Start = 0;
  int M2Start = 0;
  int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
  if ((S1Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
      (S2Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
    return false;

  // Each extract must start at element 0 (low half) or at the midpoint
  // (high half).
  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
      (M2Start != 0 && M2Start != (NumElements / 2)))
    return false;
  // When both operands are extracts, they must take the same half.
  if (S1Op1 && S2Op1 && M1Start != M2Start)
    return false;

  return true;
}
6571
6572/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6573/// of the vector elements.
6574static bool areExtractExts(Value *Ext1, Value *Ext2) {
6575 auto areExtDoubled = [](Instruction *Ext) {
6576 return Ext->getType()->getScalarSizeInBits() ==
6577 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6578 };
6579
6580 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6581 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6582 !areExtDoubled(cast<Instruction>(Ext1)) ||
6583 !areExtDoubled(cast<Instruction>(Ext2)))
6584 return false;
6585
6586 return true;
6587}
6588
6589/// Check if Op could be used with vmull_high_p64 intrinsic.
6591 Value *VectorOperand = nullptr;
6592 ConstantInt *ElementIndex = nullptr;
6593 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6594 m_ConstantInt(ElementIndex))) &&
6595 ElementIndex->getValue() == 1 &&
6596 isa<FixedVectorType>(VectorOperand->getType()) &&
6597 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6598}
6599
6600/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6601static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6603}
6604
6606 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6607 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6608 if (!GEP || GEP->getNumOperands() != 2)
6609 return false;
6610
6611 Value *Base = GEP->getOperand(0);
6612 Value *Offsets = GEP->getOperand(1);
6613
6614 // We only care about scalar_base+vector_offsets.
6615 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6616 return false;
6617
6618 // Sink extends that would allow us to use 32-bit offset vectors.
6619 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6620 auto *OffsetsInst = cast<Instruction>(Offsets);
6621 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6622 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6623 Ops.push_back(&GEP->getOperandUse(1));
6624 }
6625
6626 // Sink the GEP.
6627 return true;
6628}
6629
/// We want to sink the following cases:
6631/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6632/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6634 if (match(Op, m_VScale()))
6635 return true;
6636 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6638 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6639 return true;
6640 }
6641 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6643 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6644 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6645 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6646 return true;
6647 }
6648 return false;
6649}
6650
6651static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6652
6653/// Check if sinking \p I's operands to I's basic block is profitable, because
6654/// the operands can be folded into a target instruction, e.g.
6655/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6659 switch (II->getIntrinsicID()) {
6660 case Intrinsic::aarch64_neon_smull:
6661 case Intrinsic::aarch64_neon_umull:
6662 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6663 /*AllowSplat=*/true)) {
6664 Ops.push_back(&II->getOperandUse(0));
6665 Ops.push_back(&II->getOperandUse(1));
6666 return true;
6667 }
6668 [[fallthrough]];
6669
6670 case Intrinsic::fma:
6671 case Intrinsic::fmuladd:
6672 if (isa<VectorType>(I->getType()) &&
6673 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6674 !ST->hasFullFP16())
6675 return false;
6676
6677 if (isFNeg(II->getOperand(0)))
6678 Ops.push_back(&II->getOperandUse(0));
6679 if (isFNeg(II->getOperand(1)))
6680 Ops.push_back(&II->getOperandUse(1));
6681
6682 [[fallthrough]];
6683 case Intrinsic::aarch64_neon_sqdmull:
6684 case Intrinsic::aarch64_neon_sqdmulh:
6685 case Intrinsic::aarch64_neon_sqrdmulh:
6686 // Sink splats for index lane variants
6687 if (isSplatShuffle(II->getOperand(0)))
6688 Ops.push_back(&II->getOperandUse(0));
6689 if (isSplatShuffle(II->getOperand(1)))
6690 Ops.push_back(&II->getOperandUse(1));
6691 return !Ops.empty();
6692 case Intrinsic::aarch64_neon_fmlal:
6693 case Intrinsic::aarch64_neon_fmlal2:
6694 case Intrinsic::aarch64_neon_fmlsl:
6695 case Intrinsic::aarch64_neon_fmlsl2:
6696 // Sink splats for index lane variants
6697 if (isSplatShuffle(II->getOperand(1)))
6698 Ops.push_back(&II->getOperandUse(1));
6699 if (isSplatShuffle(II->getOperand(2)))
6700 Ops.push_back(&II->getOperandUse(2));
6701 return !Ops.empty();
6702 case Intrinsic::aarch64_sve_ptest_first:
6703 case Intrinsic::aarch64_sve_ptest_last:
6704 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6705 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6706 Ops.push_back(&II->getOperandUse(0));
6707 return !Ops.empty();
6708 case Intrinsic::aarch64_sme_write_horiz:
6709 case Intrinsic::aarch64_sme_write_vert:
6710 case Intrinsic::aarch64_sme_writeq_horiz:
6711 case Intrinsic::aarch64_sme_writeq_vert: {
6712 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6713 if (!Idx || Idx->getOpcode() != Instruction::Add)
6714 return false;
6715 Ops.push_back(&II->getOperandUse(1));
6716 return true;
6717 }
6718 case Intrinsic::aarch64_sme_read_horiz:
6719 case Intrinsic::aarch64_sme_read_vert:
6720 case Intrinsic::aarch64_sme_readq_horiz:
6721 case Intrinsic::aarch64_sme_readq_vert:
6722 case Intrinsic::aarch64_sme_ld1b_vert:
6723 case Intrinsic::aarch64_sme_ld1h_vert:
6724 case Intrinsic::aarch64_sme_ld1w_vert:
6725 case Intrinsic::aarch64_sme_ld1d_vert:
6726 case Intrinsic::aarch64_sme_ld1q_vert:
6727 case Intrinsic::aarch64_sme_st1b_vert:
6728 case Intrinsic::aarch64_sme_st1h_vert:
6729 case Intrinsic::aarch64_sme_st1w_vert:
6730 case Intrinsic::aarch64_sme_st1d_vert:
6731 case Intrinsic::aarch64_sme_st1q_vert:
6732 case Intrinsic::aarch64_sme_ld1b_horiz:
6733 case Intrinsic::aarch64_sme_ld1h_horiz:
6734 case Intrinsic::aarch64_sme_ld1w_horiz:
6735 case Intrinsic::aarch64_sme_ld1d_horiz:
6736 case Intrinsic::aarch64_sme_ld1q_horiz:
6737 case Intrinsic::aarch64_sme_st1b_horiz:
6738 case Intrinsic::aarch64_sme_st1h_horiz:
6739 case Intrinsic::aarch64_sme_st1w_horiz:
6740 case Intrinsic::aarch64_sme_st1d_horiz:
6741 case Intrinsic::aarch64_sme_st1q_horiz: {
6742 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6743 if (!Idx || Idx->getOpcode() != Instruction::Add)
6744 return false;
6745 Ops.push_back(&II->getOperandUse(3));
6746 return true;
6747 }
6748 case Intrinsic::aarch64_neon_pmull:
6749 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6750 return false;
6751 Ops.push_back(&II->getOperandUse(0));
6752 Ops.push_back(&II->getOperandUse(1));
6753 return true;
6754 case Intrinsic::aarch64_neon_pmull64:
6755 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6756 II->getArgOperand(1)))
6757 return false;
6758 Ops.push_back(&II->getArgOperandUse(0));
6759 Ops.push_back(&II->getArgOperandUse(1));
6760 return true;
6761 case Intrinsic::masked_gather:
6762 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6763 return false;
6764 Ops.push_back(&II->getArgOperandUse(0));
6765 return true;
6766 case Intrinsic::masked_scatter:
6767 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6768 return false;
6769 Ops.push_back(&II->getArgOperandUse(1));
6770 return true;
6771 default:
6772 return false;
6773 }
6774 }
6775
6776 auto ShouldSinkCondition = [](Value *Cond,
6777 SmallVectorImpl<Use *> &Ops) -> bool {
6779 return false;
6781 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6782 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6783 return false;
6784 if (isa<CmpInst>(II->getOperand(0)))
6785 Ops.push_back(&II->getOperandUse(0));
6786 return true;
6787 };
6788
6789 switch (I->getOpcode()) {
6790 case Instruction::GetElementPtr:
6791 case Instruction::Add:
6792 case Instruction::Sub:
6793 // Sink vscales closer to uses for better isel
6794 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6795 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6796 Ops.push_back(&I->getOperandUse(Op));
6797 return true;
6798 }
6799 }
6800 break;
6801 case Instruction::Select: {
6802 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6803 return false;
6804
6805 Ops.push_back(&I->getOperandUse(0));
6806 return true;
6807 }
6808 case Instruction::Br: {
6809 if (cast<BranchInst>(I)->isUnconditional())
6810 return false;
6811
6812 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6813 return false;
6814
6815 Ops.push_back(&I->getOperandUse(0));
6816 return true;
6817 }
6818 case Instruction::FMul:
6819 // fmul with contract flag can be combined with fadd into fma.
6820 // Sinking fneg into this block enables fmls pattern.
6821 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6822 if (isFNeg(I->getOperand(0)))
6823 Ops.push_back(&I->getOperandUse(0));
6824 if (isFNeg(I->getOperand(1)))
6825 Ops.push_back(&I->getOperandUse(1));
6826 }
6827 break;
6828
6829 default:
6830 break;
6831 }
6832
6833 if (!I->getType()->isVectorTy())
6834 return !Ops.empty();
6835
6836 switch (I->getOpcode()) {
6837 case Instruction::Sub:
6838 case Instruction::Add: {
6839 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6840 return false;
6841
6842 // If the exts' operands extract either the lower or upper elements, we
6843 // can sink them too.
6844 auto Ext1 = cast<Instruction>(I->getOperand(0));
6845 auto Ext2 = cast<Instruction>(I->getOperand(1));
6846 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6847 Ops.push_back(&Ext1->getOperandUse(0));
6848 Ops.push_back(&Ext2->getOperandUse(0));
6849 }
6850
6851 Ops.push_back(&I->getOperandUse(0));
6852 Ops.push_back(&I->getOperandUse(1));
6853
6854 return true;
6855 }
6856 case Instruction::Or: {
6857 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6858 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6859 if (ST->hasNEON()) {
6860 Instruction *OtherAnd, *IA, *IB;
6861 Value *MaskValue;
6862 // MainAnd refers to And instruction that has 'Not' as one of its operands
6863 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6864 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6865 m_Instruction(IA)))))) {
6866 if (match(OtherAnd,
6867 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6868 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6869 ? cast<Instruction>(I->getOperand(1))
6870 : cast<Instruction>(I->getOperand(0));
6871
6872 // Both Ands should be in same basic block as Or
6873 if (I->getParent() != MainAnd->getParent() ||
6874 I->getParent() != OtherAnd->getParent())
6875 return false;
6876
6877 // Non-mask operands of both Ands should also be in same basic block
6878 if (I->getParent() != IA->getParent() ||
6879 I->getParent() != IB->getParent())
6880 return false;
6881
6882 Ops.push_back(
6883 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6884 Ops.push_back(&I->getOperandUse(0));
6885 Ops.push_back(&I->getOperandUse(1));
6886
6887 return true;
6888 }
6889 }
6890 }
6891
6892 return false;
6893 }
6894 case Instruction::Mul: {
6895 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6896 auto *Ty = cast<VectorType>(V->getType());
6897 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6898 if (Ty->isScalableTy())
6899 return false;
6900
6901 // Indexed variants of Mul exist for i16 and i32 element types only.
6902 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6903 };
6904
6905 int NumZExts = 0, NumSExts = 0;
6906 for (auto &Op : I->operands()) {
6907 // Make sure we are not already sinking this operand
6908 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6909 continue;
6910
6911 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6912 auto *Ext = cast<Instruction>(Op);
6913 auto *ExtOp = Ext->getOperand(0);
6914 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6915 Ops.push_back(&Ext->getOperandUse(0));
6916 Ops.push_back(&Op);
6917
6918 if (isa<SExtInst>(Ext)) {
6919 NumSExts++;
6920 } else {
6921 NumZExts++;
6922 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6923 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6924 I->getType()->getScalarSizeInBits())
6925 NumSExts++;
6926 }
6927
6928 continue;
6929 }
6930
6932 if (!Shuffle)
6933 continue;
6934
6935 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6936 // operand and the s/zext can help create indexed s/umull. This is
6937 // especially useful to prevent i64 mul being scalarized.
6938 if (isSplatShuffle(Shuffle) &&
6939 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6940 Ops.push_back(&Shuffle->getOperandUse(0));
6941 Ops.push_back(&Op);
6942 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6943 NumSExts++;
6944 else
6945 NumZExts++;
6946 continue;
6947 }
6948
6949 Value *ShuffleOperand = Shuffle->getOperand(0);
6950 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6951 if (!Insert)
6952 continue;
6953
6954 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6955 if (!OperandInstr)
6956 continue;
6957
6958 ConstantInt *ElementConstant =
6959 dyn_cast<ConstantInt>(Insert->getOperand(2));
6960 // Check that the insertelement is inserting into element 0
6961 if (!ElementConstant || !ElementConstant->isZero())
6962 continue;
6963
6964 unsigned Opcode = OperandInstr->getOpcode();
6965 if (Opcode == Instruction::SExt)
6966 NumSExts++;
6967 else if (Opcode == Instruction::ZExt)
6968 NumZExts++;
6969 else {
6970 // If we find that the top bits are known 0, then we can sink and allow
6971 // the backend to generate a umull.
6972 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6973 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6974 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6975 continue;
6976 NumZExts++;
6977 }
6978
6979 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6980 // the And, just to hoist it again back to the load.
6981 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6982 Ops.push_back(&Insert->getOperandUse(1));
6983 Ops.push_back(&Shuffle->getOperandUse(0));
6984 Ops.push_back(&Op);
6985 }
6986
6987 // It is profitable to sink if we found two of the same type of extends.
6988 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6989 return true;
6990
6991 // Otherwise, see if we should sink splats for indexed variants.
6992 if (!ShouldSinkSplatForIndexedVariant(I))
6993 return false;
6994
6995 Ops.clear();
6996 if (isSplatShuffle(I->getOperand(0)))
6997 Ops.push_back(&I->getOperandUse(0));
6998 if (isSplatShuffle(I->getOperand(1)))
6999 Ops.push_back(&I->getOperandUse(1));
7000
7001 return !Ops.empty();
7002 }
7003 case Instruction::FMul: {
7004 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7005 if (I->getType()->isScalableTy())
7006 return !Ops.empty();
7007
7008 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7009 !ST->hasFullFP16())
7010 return !Ops.empty();
7011
7012 // Sink splats for index lane variants
7013 if (isSplatShuffle(I->getOperand(0)))
7014 Ops.push_back(&I->getOperandUse(0));
7015 if (isSplatShuffle(I->getOperand(1)))
7016 Ops.push_back(&I->getOperandUse(1));
7017 return !Ops.empty();
7018 }
7019 default:
7020 return false;
7021 }
7022 return false;
7023}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
unsigned countLeadingOnes() const
Definition APInt.h:1639
void negate()
Negate this APInt in place.
Definition APInt.h:1483
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
unsigned logBase2() const
Definition APInt.h:1776
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:771
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:72
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1110
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:574
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:594
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:561
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:579
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1944
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2258
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2473
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1717
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1854
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1867
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:589
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2249
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:730
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa&lt;X&gt; - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if the expansion is for a memcmp whose result is only compared with zero.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compares and branches).