AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
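// Illustrative example (added annotation, not part of the original source):
// with -sve-tail-folding=all+noreductions, InitialBits is set to
// TailFoldingOpts::All and DisableBits gains TailFoldingOpts::Reductions, so
// getBits() below yields "all legal loop types except reductions". With
// -sve-tail-folding=default+reverse, NeedsDefault stays true and EnableBits
// gains TailFoldingOpts::Reverse on top of whatever the CPU default is.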
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
214 cl::location(TailFoldingOptionLoc));
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
248 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
256 StringRef AttributeStr =
257 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
258 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261 return AArch64::getFMVPriority(Features);
262}
263
265 return F.hasFnAttribute("fmv-features");
266}
267
268const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
269 AArch64::FeatureExecuteOnly,
270};
271
273 const Function *Callee) const {
274 SMECallAttrs CallAttrs(*Caller, *Callee);
275
276 // Never inline a function explicitly marked as being streaming,
277 // into a non-streaming function. Assume it was marked as streaming
278 // for a reason.
279 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
281 return false;
282
283 // When inlining, we should consider the body of the function, not the
284 // interface.
285 if (CallAttrs.callee().hasStreamingBody()) {
286 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
287 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
288 }
289
290 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
291 return false;
292
293 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
294 CallAttrs.requiresPreservingZT0() ||
295 CallAttrs.requiresPreservingAllZAState()) {
296 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
297 return false;
298 }
299
300 const TargetMachine &TM = getTLI()->getTargetMachine();
301 const FeatureBitset &CallerBits =
302 TM.getSubtargetImpl(*Caller)->getFeatureBits();
303 const FeatureBitset &CalleeBits =
304 TM.getSubtargetImpl(*Callee)->getFeatureBits();
305 // Adjust the feature bitsets by inverting some of the bits. This is needed
306 // for target features that represent restrictions rather than capabilities,
307 // for example a "+execute-only" callee can be inlined into a caller without
308 // "+execute-only", but not vice versa.
309 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
310 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
311
312 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
313}
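// Worked example of the inversion above (annotation, not original source):
// for FeatureExecuteOnly the bit is flipped in both masks, so a callee built
// with +execute-only (bit 1 -> 0 after XOR) inlined into a caller without it
// (bit 0 -> 1) still satisfies (Caller & Callee) == Callee, while the reverse
// direction fails the check, matching the comment above.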
314
316 const Function *Callee,
317 ArrayRef<Type *> Types) const {
318 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
319 return false;
320
321 // We need to ensure that argument promotion does not attempt to promote
322 // pointers to fixed-length vector types larger than 128 bits like
323 // <8 x float> (and pointers to aggregate types which have such fixed-length
324 // vector type members) into the values of the pointees. Such vector types
325 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
326 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
327 // types can be safely treated as 128-bit NEON types and they cannot be
328 // distinguished in IR.
329 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
330 auto FVTy = dyn_cast<FixedVectorType>(Ty);
331 return FVTy &&
332 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
333 }))
334 return false;
335
336 return true;
337}
338
339unsigned
341 unsigned DefaultCallPenalty) const {
342 // This function calculates a penalty for executing Call in F.
343 //
344 // There are two ways this function can be called:
345 // (1) F:
346 // call from F -> G (the call here is Call)
347 //
348 // For (1), Call.getCaller() == F, so it will always return a high cost if
349 // a streaming-mode change is required (thus promoting the need to inline the
350 // function)
351 //
352 // (2) F:
353 // call from F -> G (the call here is not Call)
354 // G:
355 // call from G -> H (the call here is Call)
356 //
357 // For (2), if after inlining the body of G into F the call to H requires a
358 // streaming-mode change, and the call to G from F would also require a
359 // streaming-mode change, then there is benefit to do the streaming-mode
360 // change only once and avoid inlining of G into F.
361
362 SMEAttrs FAttrs(*F);
363 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
364
365 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
366 if (F == Call.getCaller()) // (1)
367 return CallPenaltyChangeSM * DefaultCallPenalty;
368 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
369 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
370 }
371
372 return DefaultCallPenalty;
373}
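// Rough worked example (annotation, not original source): with the default
// option values above, case (1) returns 5 * DefaultCallPenalty
// (CallPenaltyChangeSM) and case (2) returns 10 * DefaultCallPenalty
// (InlineCallPenaltyChangeSM); otherwise the default penalty is returned
// unchanged.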
374
381
382/// Calculate the cost of materializing a 64-bit value. This helper
383/// method might only calculate a fraction of a larger immediate. Therefore it
384/// is valid to return a cost of ZERO.
386 // Check if the immediate can be encoded within an instruction.
387 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
388 return 0;
389
390 if (Val < 0)
391 Val = ~Val;
392
393 // Calculate how many moves we will need to materialize this constant.
395 AArch64_IMM::expandMOVImm(Val, 64, Insn);
396 return Insn.size();
397}
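// Illustrative costs (annotation, not original source): a value such as
// 0x0000FFFF00000000 is a valid 64-bit logical immediate and costs 0 here,
// while an arbitrary 64-bit constant expands to a MOVZ plus up to three
// MOVKs, giving a cost of up to 4.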
398
399/// Calculate the cost of materializing the given constant.
403 assert(Ty->isIntegerTy());
404
405 unsigned BitSize = Ty->getPrimitiveSizeInBits();
406 if (BitSize == 0)
407 return ~0U;
408
409 // Sign-extend all constants to a multiple of 64-bit.
410 APInt ImmVal = Imm;
411 if (BitSize & 0x3f)
412 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
413
414 // Split the constant into 64-bit chunks and calculate the cost for each
415 // chunk.
416 InstructionCost Cost = 0;
417 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
418 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
419 int64_t Val = Tmp.getSExtValue();
420 Cost += getIntImmCost(Val);
421 }
422 // We need at least one instruction to materialize the constant.
423 return std::max<InstructionCost>(1, Cost);
424}
425
427 const APInt &Imm, Type *Ty,
429 Instruction *Inst) const {
430 assert(Ty->isIntegerTy());
431
432 unsigned BitSize = Ty->getPrimitiveSizeInBits();
433 // There is no cost model for constants with a bit size of 0. Return TCC_Free
434 // here, so that constant hoisting will ignore this constant.
435 if (BitSize == 0)
436 return TTI::TCC_Free;
437
438 unsigned ImmIdx = ~0U;
439 switch (Opcode) {
440 default:
441 return TTI::TCC_Free;
442 case Instruction::GetElementPtr:
443 // Always hoist the base address of a GetElementPtr.
444 if (Idx == 0)
445 return 2 * TTI::TCC_Basic;
446 return TTI::TCC_Free;
447 case Instruction::Store:
448 ImmIdx = 0;
449 break;
450 case Instruction::Add:
451 case Instruction::Sub:
452 case Instruction::Mul:
453 case Instruction::UDiv:
454 case Instruction::SDiv:
455 case Instruction::URem:
456 case Instruction::SRem:
457 case Instruction::And:
458 case Instruction::Or:
459 case Instruction::Xor:
460 case Instruction::ICmp:
461 ImmIdx = 1;
462 break;
463 // Always return TCC_Free for the shift value of a shift instruction.
464 case Instruction::Shl:
465 case Instruction::LShr:
466 case Instruction::AShr:
467 if (Idx == 1)
468 return TTI::TCC_Free;
469 break;
470 case Instruction::Trunc:
471 case Instruction::ZExt:
472 case Instruction::SExt:
473 case Instruction::IntToPtr:
474 case Instruction::PtrToInt:
475 case Instruction::BitCast:
476 case Instruction::PHI:
477 case Instruction::Call:
478 case Instruction::Select:
479 case Instruction::Ret:
480 case Instruction::Load:
481 break;
482 }
483
484 if (Idx == ImmIdx) {
485 int NumConstants = (BitSize + 63) / 64;
487 return (Cost <= NumConstants * TTI::TCC_Basic)
488 ? static_cast<int>(TTI::TCC_Free)
489 : Cost;
490 }
492}
493
496 const APInt &Imm, Type *Ty,
498 assert(Ty->isIntegerTy());
499
500 unsigned BitSize = Ty->getPrimitiveSizeInBits();
501 // There is no cost model for constants with a bit size of 0. Return TCC_Free
502 // here, so that constant hoisting will ignore this constant.
503 if (BitSize == 0)
504 return TTI::TCC_Free;
505
506 // Most (all?) AArch64 intrinsics do not support folding immediates into the
507 // selected instruction, so we compute the materialization cost for the
508 // immediate directly.
509 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
511
512 switch (IID) {
513 default:
514 return TTI::TCC_Free;
515 case Intrinsic::sadd_with_overflow:
516 case Intrinsic::uadd_with_overflow:
517 case Intrinsic::ssub_with_overflow:
518 case Intrinsic::usub_with_overflow:
519 case Intrinsic::smul_with_overflow:
520 case Intrinsic::umul_with_overflow:
521 if (Idx == 1) {
522 int NumConstants = (BitSize + 63) / 64;
524 return (Cost <= NumConstants * TTI::TCC_Basic)
525 ? static_cast<int>(TTI::TCC_Free)
526 : Cost;
527 }
528 break;
529 case Intrinsic::experimental_stackmap:
530 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
531 return TTI::TCC_Free;
532 break;
533 case Intrinsic::experimental_patchpoint_void:
534 case Intrinsic::experimental_patchpoint:
535 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 case Intrinsic::experimental_gc_statepoint:
539 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
540 return TTI::TCC_Free;
541 break;
542 }
544}
545
547AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
548 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
549 if (TyWidth == 32 || TyWidth == 64)
551 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
552 return TTI::PSK_Software;
553}
554
555static bool isUnpackedVectorVT(EVT VecVT) {
556 return VecVT.isScalableVector() &&
558}
559
561 const IntrinsicCostAttributes &ICA) {
562 // We need to know at least the number of elements in the vector of buckets
563 // and the size of each element to update.
564 if (ICA.getArgTypes().size() < 2)
566
567 // Only interested in costing for the hardware instruction from SVE2.
568 if (!ST->hasSVE2())
570
571 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
572 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
573 unsigned TotalHistCnts = 1;
574
575 unsigned EltSize = EltTy->getScalarSizeInBits();
576 // Only allow (up to 64b) integers or pointers
577 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
579
580 // FIXME: We should be able to generate histcnt for fixed-length vectors
581 // using ptrue with a specific VL.
582 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
583 unsigned EC = VTy->getElementCount().getKnownMinValue();
584 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
586
587 // HistCnt only supports 32b and 64b element types
588 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
589
590 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
592
593 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
594 TotalHistCnts = EC / NaturalVectorWidth;
595
596 return InstructionCost(BaseHistCntCost * TotalHistCnts);
597 }
598
600}
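// Worked example (annotation, not original source): for <vscale x 8 x ptr>
// buckets with i32 elements, LegalEltSize is 32, NaturalVectorWidth is
// 128 / 32 = 4 and TotalHistCnts is 8 / 4 = 2, so the returned cost is
// BaseHistCntCost (default 8) * 2 = 16.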
601
605 // The code-generator is currently not able to handle scalable vectors
606 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
607 // it. This change will be removed when code-generation for these types is
608 // sufficiently reliable.
609 auto *RetTy = ICA.getReturnType();
610 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
611 if (VTy->getElementCount() == ElementCount::getScalable(1))
613
614 switch (ICA.getID()) {
615 case Intrinsic::experimental_vector_histogram_add: {
616 InstructionCost HistCost = getHistogramCost(ST, ICA);
617 // If the cost isn't valid, we may still be able to scalarize
618 if (HistCost.isValid())
619 return HistCost;
620 break;
621 }
622 case Intrinsic::umin:
623 case Intrinsic::umax:
624 case Intrinsic::smin:
625 case Intrinsic::smax: {
626 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
627 MVT::v8i16, MVT::v2i32, MVT::v4i32,
628 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
629 MVT::nxv2i64};
630 auto LT = getTypeLegalizationCost(RetTy);
631 // v2i64 types get converted to cmp+bif hence the cost of 2
632 if (LT.second == MVT::v2i64)
633 return LT.first * 2;
634 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
635 return LT.first;
636 break;
637 }
638 case Intrinsic::sadd_sat:
639 case Intrinsic::ssub_sat:
640 case Intrinsic::uadd_sat:
641 case Intrinsic::usub_sat: {
642 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::v2i64};
645 auto LT = getTypeLegalizationCost(RetTy);
646 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
647 // need to extend the type, as it uses shr(qadd(shl, shl)).
648 unsigned Instrs =
649 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
650 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
651 return LT.first * Instrs;
652
654 uint64_t VectorSize = TS.getKnownMinValue();
655
656 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
657 return LT.first * Instrs;
658
659 break;
660 }
661 case Intrinsic::abs: {
662 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
663 MVT::v8i16, MVT::v2i32, MVT::v4i32,
664 MVT::v2i64};
665 auto LT = getTypeLegalizationCost(RetTy);
666 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
667 return LT.first;
668 break;
669 }
670 case Intrinsic::bswap: {
671 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
672 MVT::v4i32, MVT::v2i64};
673 auto LT = getTypeLegalizationCost(RetTy);
674 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
675 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
676 return LT.first;
677 break;
678 }
679 case Intrinsic::fma:
680 case Intrinsic::fmuladd: {
681 // Given an fma or fmuladd, cost it the same as an fmul instruction, since
682 // their costs are usually the same. TODO: Add fp16 and bf16 expansion costs.
683 Type *EltTy = RetTy->getScalarType();
684 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
685 (EltTy->isHalfTy() && ST->hasFullFP16()))
686 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
687 break;
688 }
689 case Intrinsic::stepvector: {
690 InstructionCost Cost = 1; // Cost of the `index' instruction
691 auto LT = getTypeLegalizationCost(RetTy);
692 // Legalisation of illegal vectors involves an `index' instruction plus
693 // (LT.first - 1) vector adds.
694 if (LT.first > 1) {
695 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
696 InstructionCost AddCost =
697 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
698 Cost += AddCost * (LT.first - 1);
699 }
700 return Cost;
701 }
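// Worked example (annotation, not original source): a return type that
// legalises into four parts, e.g. nxv16i32 split into nxv4i32 pieces, costs
// 1 for the initial `index` plus 3 vector adds, assuming each add costs 1.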
702 case Intrinsic::vector_extract:
703 case Intrinsic::vector_insert: {
704 // If both the vector and subvector types are legal types and the index
705 // is 0, then this should be a no-op or simple operation; return a
706 // relatively low cost.
707
708 // If arguments aren't actually supplied, then we cannot determine the
709 // value of the index. We also want to skip predicate types.
710 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
712 break;
713
714 LLVMContext &C = RetTy->getContext();
715 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
716 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
717 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
718 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
719 // Skip this if either the vector or subvector types are unpacked
720 // SVE types; they may get lowered to stack stores and loads.
721 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
722 break;
723
725 getTLI()->getTypeConversion(C, SubVecVT);
727 getTLI()->getTypeConversion(C, VecVT);
728 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
729 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
730 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
731 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
732 return TTI::TCC_Free;
733 break;
734 }
735 case Intrinsic::bitreverse: {
736 static const CostTblEntry BitreverseTbl[] = {
737 {Intrinsic::bitreverse, MVT::i32, 1},
738 {Intrinsic::bitreverse, MVT::i64, 1},
739 {Intrinsic::bitreverse, MVT::v8i8, 1},
740 {Intrinsic::bitreverse, MVT::v16i8, 1},
741 {Intrinsic::bitreverse, MVT::v4i16, 2},
742 {Intrinsic::bitreverse, MVT::v8i16, 2},
743 {Intrinsic::bitreverse, MVT::v2i32, 2},
744 {Intrinsic::bitreverse, MVT::v4i32, 2},
745 {Intrinsic::bitreverse, MVT::v1i64, 2},
746 {Intrinsic::bitreverse, MVT::v2i64, 2},
747 };
748 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
749 const auto *Entry =
750 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
751 if (Entry) {
752 // The cost model uses the legal type (i32) that i8 and i16 will be
753 // promoted to, plus 1 so that we match the actual lowering cost.
754 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
755 TLI->getValueType(DL, RetTy, true) == MVT::i16)
756 return LegalisationCost.first * Entry->Cost + 1;
757
758 return LegalisationCost.first * Entry->Cost;
759 }
760 break;
761 }
762 case Intrinsic::ctpop: {
763 if (!ST->hasNEON()) {
764 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
765 return getTypeLegalizationCost(RetTy).first * 12;
766 }
767 static const CostTblEntry CtpopCostTbl[] = {
768 {ISD::CTPOP, MVT::v2i64, 4},
769 {ISD::CTPOP, MVT::v4i32, 3},
770 {ISD::CTPOP, MVT::v8i16, 2},
771 {ISD::CTPOP, MVT::v16i8, 1},
772 {ISD::CTPOP, MVT::i64, 4},
773 {ISD::CTPOP, MVT::v2i32, 3},
774 {ISD::CTPOP, MVT::v4i16, 2},
775 {ISD::CTPOP, MVT::v8i8, 1},
776 {ISD::CTPOP, MVT::i32, 5},
777 };
778 auto LT = getTypeLegalizationCost(RetTy);
779 MVT MTy = LT.second;
780 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
781 // Extra cost of +1 when illegal vector types are legalized by promoting
782 // the integer type.
783 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
784 RetTy->getScalarSizeInBits()
785 ? 1
786 : 0;
787 return LT.first * Entry->Cost + ExtraCost;
788 }
789 break;
790 }
791 case Intrinsic::sadd_with_overflow:
792 case Intrinsic::uadd_with_overflow:
793 case Intrinsic::ssub_with_overflow:
794 case Intrinsic::usub_with_overflow:
795 case Intrinsic::smul_with_overflow:
796 case Intrinsic::umul_with_overflow: {
797 static const CostTblEntry WithOverflowCostTbl[] = {
798 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
799 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
800 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
801 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
802 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
803 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
804 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
805 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
806 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
807 {Intrinsic::usub_with_overflow, MVT::i8, 3},
808 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
809 {Intrinsic::usub_with_overflow, MVT::i16, 3},
810 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
811 {Intrinsic::usub_with_overflow, MVT::i32, 1},
812 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
813 {Intrinsic::usub_with_overflow, MVT::i64, 1},
814 {Intrinsic::smul_with_overflow, MVT::i8, 5},
815 {Intrinsic::umul_with_overflow, MVT::i8, 4},
816 {Intrinsic::smul_with_overflow, MVT::i16, 5},
817 {Intrinsic::umul_with_overflow, MVT::i16, 4},
818 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
819 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
820 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
821 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
822 };
823 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
824 if (MTy.isSimple())
825 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
826 MTy.getSimpleVT()))
827 return Entry->Cost;
828 break;
829 }
830 case Intrinsic::fptosi_sat:
831 case Intrinsic::fptoui_sat: {
832 if (ICA.getArgTypes().empty())
833 break;
834 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
835 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
836 EVT MTy = TLI->getValueType(DL, RetTy);
837 // Check for the legal types, which are where the size of the input and the
838 // output are the same, or we are using cvt f64->i32 or f32->i64.
839 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
840 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
841 LT.second == MVT::v2f64)) {
842 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
843 (LT.second == MVT::f64 && MTy == MVT::i32) ||
844 (LT.second == MVT::f32 && MTy == MVT::i64)))
845 return LT.first;
846 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
847 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
848 MTy.getScalarSizeInBits() == 64)
849 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
850 }
851 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
852 // f32.
853 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
854 return LT.first + getIntrinsicInstrCost(
855 {ICA.getID(),
856 RetTy,
857 {ICA.getArgTypes()[0]->getWithNewType(
858 Type::getFloatTy(RetTy->getContext()))}},
859 CostKind);
860 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
861 (LT.second == MVT::f16 && MTy == MVT::i64) ||
862 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
863 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
864 return LT.first;
865 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
866 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
867 MTy.getScalarSizeInBits() == 32)
868 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
869 // Extending vector types v8f16->v8i32. These currently scalarize but the
870 // codegen could be better.
871 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
872 MTy.getScalarSizeInBits() == 64)
873 return MTy.getVectorNumElements() * 3;
874
875 // If we can, we use a legal convert followed by a min+max
876 if ((LT.second.getScalarType() == MVT::f32 ||
877 LT.second.getScalarType() == MVT::f64 ||
878 LT.second.getScalarType() == MVT::f16) &&
879 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
880 Type *LegalTy =
881 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
882 if (LT.second.isVector())
883 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
885 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
886 LegalTy, {LegalTy, LegalTy});
888 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
889 LegalTy, {LegalTy, LegalTy});
891 return LT.first * Cost +
892 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
893 : 1);
894 }
895 // Otherwise we need to follow the default expansion that clamps the value
896 // using a float min/max with a fcmp+sel for nan handling when signed.
897 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
898 RetTy = RetTy->getScalarType();
899 if (LT.second.isVector()) {
900 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
901 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
902 }
903 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
905 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
907 Cost +=
908 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
910 if (IsSigned) {
911 Type *CondTy = RetTy->getWithNewBitWidth(1);
912 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
914 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
916 }
917 return LT.first * Cost;
918 }
919 case Intrinsic::fshl:
920 case Intrinsic::fshr: {
921 if (ICA.getArgs().empty())
922 break;
923
924 // TODO: Add handling for fshl where third argument is not a constant.
925 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
926 if (!OpInfoZ.isConstant())
927 break;
928
929 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
930 if (OpInfoZ.isUniform()) {
931 static const CostTblEntry FshlTbl[] = {
932 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
933 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
934 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
935 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
936 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
937 // to avoid having to duplicate the costs.
938 const auto *Entry =
939 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
940 if (Entry)
941 return LegalisationCost.first * Entry->Cost;
942 }
943
944 auto TyL = getTypeLegalizationCost(RetTy);
945 if (!RetTy->isIntegerTy())
946 break;
947
948 // Estimate cost manually, as types like i8 and i16 will get promoted to
949 // i32 and CostTableLookup will ignore the extra conversion cost.
950 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
951 RetTy->getScalarSizeInBits() < 64) ||
952 (RetTy->getScalarSizeInBits() % 64 != 0);
953 unsigned ExtraCost = HigherCost ? 1 : 0;
954 if (RetTy->getScalarSizeInBits() == 32 ||
955 RetTy->getScalarSizeInBits() == 64)
956 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
957 // extr instruction.
958 else if (HigherCost)
959 ExtraCost = 1;
960 else
961 break;
962 return TyL.first + ExtraCost;
963 }
964 case Intrinsic::get_active_lane_mask: {
965 auto RetTy = cast<VectorType>(ICA.getReturnType());
966 EVT RetVT = getTLI()->getValueType(DL, RetTy);
967 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
968 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
969 break;
970
971 if (RetTy->isScalableTy()) {
972 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
974 break;
975
976 auto LT = getTypeLegalizationCost(RetTy);
977 InstructionCost Cost = LT.first;
978 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
979 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
980 // nxv32i1 = get_active_lane_mask(base, idx) ->
981 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
982 if (ST->hasSVE2p1() || ST->hasSME2()) {
983 Cost /= 2;
984 if (Cost == 1)
985 return Cost;
986 }
987
988 // If more than one whilelo intrinsic is required, include the extra cost
989 // required by the saturating add & select required to increment the
990 // start value after the first intrinsic call.
991 Type *OpTy = ICA.getArgTypes()[0];
992 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
993 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
994 Type *CondTy = OpTy->getWithNewBitWidth(1);
995 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
997 return Cost + (SplitCost * (Cost - 1));
998 } else if (!getTLI()->isTypeLegal(RetVT)) {
999 // We don't have enough context at this point to determine if the mask
1000 // is going to be kept live after the block, which will force the vXi1
1001 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1002 // For now, we just assume the vectorizer created this intrinsic and
1003 // the result will be the input for a PHI. In this case the cost will
1004 // be extremely high for fixed-width vectors.
1005 // NOTE: getScalarizationOverhead returns a cost that's far too
1006 // pessimistic for the actual generated codegen. In reality there are
1007 // two instructions generated per lane.
1008 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1009 }
1010 break;
1011 }
1012 case Intrinsic::experimental_vector_match: {
1013 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1014 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1015 unsigned SearchSize = NeedleTy->getNumElements();
1016 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1017 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1018 // Neoverse V3, these are cheap operations with the same latency as a
1019 // vector ADD. In most cases, however, we also need to do an extra DUP.
1020 // For fixed-length vectors we currently need an extra five to six
1021 // instructions besides the MATCH.
1023 if (isa<FixedVectorType>(RetTy))
1024 Cost += 10;
1025 return Cost;
1026 }
1027 break;
1028 }
1029 case Intrinsic::experimental_cttz_elts: {
1030 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1031 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1032 // This will consist of a SVE brkb and a cntp instruction. These
1033 // typically have the same latency and half the throughput as a vector
1034 // add instruction.
1035 return 4;
1036 }
1037 break;
1038 }
1039 case Intrinsic::experimental_vector_extract_last_active:
1040 if (ST->isSVEorStreamingSVEAvailable()) {
1041 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1042 // This should turn into chained clastb instructions.
1043 return LegalCost;
1044 }
1045 break;
1046 default:
1047 break;
1048 }
1050}
1051
1052/// The function will remove redundant reinterpret casts in the presence
1053/// of control flow.
1054static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1055 IntrinsicInst &II) {
1057 auto RequiredType = II.getType();
1058
1059 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1060 assert(PN && "Expected Phi Node!");
1061
1062 // Don't create a new Phi unless we can remove the old one.
1063 if (!PN->hasOneUse())
1064 return std::nullopt;
1065
1066 for (Value *IncValPhi : PN->incoming_values()) {
1067 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1068 if (!Reinterpret ||
1069 Reinterpret->getIntrinsicID() !=
1070 Intrinsic::aarch64_sve_convert_to_svbool ||
1071 RequiredType != Reinterpret->getArgOperand(0)->getType())
1072 return std::nullopt;
1073 }
1074
1075 // Create the new Phi
1076 IC.Builder.SetInsertPoint(PN);
1077 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1078 Worklist.push_back(PN);
1079
1080 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1081 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1082 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1083 Worklist.push_back(Reinterpret);
1084 }
1085
1086 // Cleanup Phi Node and reinterprets
1087 return IC.replaceInstUsesWith(II, NPN);
1088}
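// Illustrative IR sketch of the combine above (annotation, not original
// source), assuming II is a convert.from.svbool call fed by the phi:
//   %p = phi <vscale x 16 x i1> [ %a.cast, %bb0 ], [ %b.cast, %bb1 ]
//   %r = @llvm.aarch64.sve.convert.from.svbool(%p)
// where each incoming value is convert.to.svbool of a <vscale x 4 x i1>
// becomes a single phi over the original <vscale x 4 x i1> values.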
1089
1090// A collection of properties common to SVE intrinsics that allow for combines
1091// to be written without needing to know the specific intrinsic.
1093 //
1094 // Helper routines for common intrinsic definitions.
1095 //
1096
1097 // e.g. llvm.aarch64.sve.add pg, op1, op2
1098 // with IID ==> llvm.aarch64.sve.add_u
1099 static SVEIntrinsicInfo
1106
1107 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1114
1115 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1121
1122 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1128
1129 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1130 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1131 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1132 return SVEIntrinsicInfo()
1135 }
1136
1137 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1138 // llvm.aarch64.sve.ld1 pg, ptr
1145
1146 // All properties relate to predication and thus having a general predicate
1147 // is the minimum requirement to say there is intrinsic info to act on.
1148 explicit operator bool() const { return hasGoverningPredicate(); }
1149
1150 //
1151 // Properties relating to the governing predicate.
1152 //
1153
1155 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1156 }
1157
1159 assert(hasGoverningPredicate() && "Property not set!");
1160 return GoverningPredicateIdx;
1161 }
1162
1164 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1165 GoverningPredicateIdx = Index;
1166 return *this;
1167 }
1168
1169 //
1170 // Properties relating to operations the intrinsic could be transformed into.
1171 // NOTE: This does not mean such a transformation is always possible, but the
1172 // knowledge makes it possible to reuse existing optimisations without needing
1173 // to embed specific handling for each intrinsic. For example, instruction
1174 // simplification can be used to optimise an intrinsic's active lanes.
1175 //
1176
1178 return UndefIntrinsic != Intrinsic::not_intrinsic;
1179 }
1180
1182 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1183 return UndefIntrinsic;
1184 }
1185
1187 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1188 UndefIntrinsic = IID;
1189 return *this;
1190 }
1191
1192 bool hasMatchingIROpode() const { return IROpcode != 0; }
1193
1194 unsigned getMatchingIROpode() const {
1195 assert(hasMatchingIROpode() && "Property not set!");
1196 return IROpcode;
1197 }
1198
1200 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1201 IROpcode = Opcode;
1202 return *this;
1203 }
1204
1205 //
1206 // Properties relating to the result of inactive lanes.
1207 //
1208
1210 return ResultLanes == InactiveLanesTakenFromOperand;
1211 }
1212
1214 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1215 return OperandIdxForInactiveLanes;
1216 }
1217
1219 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1220 ResultLanes = InactiveLanesTakenFromOperand;
1221 OperandIdxForInactiveLanes = Index;
1222 return *this;
1223 }
1224
1226 return ResultLanes == InactiveLanesAreNotDefined;
1227 }
1228
1230 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1231 ResultLanes = InactiveLanesAreNotDefined;
1232 return *this;
1233 }
1234
1236 return ResultLanes == InactiveLanesAreUnused;
1237 }
1238
1240 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1241 ResultLanes = InactiveLanesAreUnused;
1242 return *this;
1243 }
1244
1245 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1246 // inactiveLanesAreZeroed =
1247 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1248 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1249
1251 ResultIsZeroInitialized = true;
1252 return *this;
1253 }
1254
1255 //
1256 // The first operand of unary merging operations is typically only used to
1257 // set the result for inactive lanes. Knowing this allows us to deadcode the
1258 // operand when we can prove there are no inactive lanes.
1259 //
1260
1262 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1263 }
1264
1266 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1267 return OperandIdxWithNoActiveLanes;
1268 }
1269
1271 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1272 OperandIdxWithNoActiveLanes = Index;
1273 return *this;
1274 }
1275
1276private:
1277 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1278
1279 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1280 unsigned IROpcode = 0;
1281
1282 enum PredicationStyle {
1284 InactiveLanesTakenFromOperand,
1285 InactiveLanesAreNotDefined,
1286 InactiveLanesAreUnused
1287 } ResultLanes = Uninitialized;
1288
1289 bool ResultIsZeroInitialized = false;
1290 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1291 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1292};
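// Illustrative usage sketch (annotation, not original source): given an
// SVEIntrinsicInfo `Info` built for an intrinsic call II, a combine can stay
// intrinsic-agnostic, e.g.:
//   if (Info && Info.hasMatchingUndefIntrinsic() &&
//       isAllActivePredicate(II.getOperand(Info.getGoverningPredicateIdx())))
//     ... switch II over to Info.getMatchingUndefIntrinsic() ...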
1293
1295 // Some SVE intrinsics do not use scalable vector types, but since they are
1296 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1297 if (!isa<ScalableVectorType>(II.getType()) &&
1298 all_of(II.args(), [&](const Value *V) {
1299 return !isa<ScalableVectorType>(V->getType());
1300 }))
1301 return SVEIntrinsicInfo();
1302
1303 Intrinsic::ID IID = II.getIntrinsicID();
1304 switch (IID) {
1305 default:
1306 break;
1307 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1308 case Intrinsic::aarch64_sve_fcvt_f16f32:
1309 case Intrinsic::aarch64_sve_fcvt_f16f64:
1310 case Intrinsic::aarch64_sve_fcvt_f32f16:
1311 case Intrinsic::aarch64_sve_fcvt_f32f64:
1312 case Intrinsic::aarch64_sve_fcvt_f64f16:
1313 case Intrinsic::aarch64_sve_fcvt_f64f32:
1314 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1315 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1316 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1317 case Intrinsic::aarch64_sve_fcvtzs:
1318 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1319 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1320 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1321 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1322 case Intrinsic::aarch64_sve_fcvtzu:
1323 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1324 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1325 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1326 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1327 case Intrinsic::aarch64_sve_scvtf:
1328 case Intrinsic::aarch64_sve_scvtf_f16i32:
1329 case Intrinsic::aarch64_sve_scvtf_f16i64:
1330 case Intrinsic::aarch64_sve_scvtf_f32i64:
1331 case Intrinsic::aarch64_sve_scvtf_f64i32:
1332 case Intrinsic::aarch64_sve_ucvtf:
1333 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1334 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1335 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1336 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1338
1339 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1340 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1341 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1342 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1344
1345 case Intrinsic::aarch64_sve_fabd:
1346 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1347 case Intrinsic::aarch64_sve_fadd:
1348 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1349 .setMatchingIROpcode(Instruction::FAdd);
1350 case Intrinsic::aarch64_sve_fdiv:
1351 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1352 .setMatchingIROpcode(Instruction::FDiv);
1353 case Intrinsic::aarch64_sve_fmax:
1354 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1355 case Intrinsic::aarch64_sve_fmaxnm:
1356 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1357 case Intrinsic::aarch64_sve_fmin:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1359 case Intrinsic::aarch64_sve_fminnm:
1360 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1361 case Intrinsic::aarch64_sve_fmla:
1362 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1363 case Intrinsic::aarch64_sve_fmls:
1364 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1365 case Intrinsic::aarch64_sve_fmul:
1366 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1367 .setMatchingIROpcode(Instruction::FMul);
1368 case Intrinsic::aarch64_sve_fmulx:
1369 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1370 case Intrinsic::aarch64_sve_fnmla:
1371 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1372 case Intrinsic::aarch64_sve_fnmls:
1373 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1374 case Intrinsic::aarch64_sve_fsub:
1375 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1376 .setMatchingIROpcode(Instruction::FSub);
1377 case Intrinsic::aarch64_sve_add:
1378 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1379 .setMatchingIROpcode(Instruction::Add);
1380 case Intrinsic::aarch64_sve_mla:
1381 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1382 case Intrinsic::aarch64_sve_mls:
1383 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1384 case Intrinsic::aarch64_sve_mul:
1385 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1386 .setMatchingIROpcode(Instruction::Mul);
1387 case Intrinsic::aarch64_sve_sabd:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1389 case Intrinsic::aarch64_sve_sdiv:
1390 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1391 .setMatchingIROpcode(Instruction::SDiv);
1392 case Intrinsic::aarch64_sve_smax:
1393 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1394 case Intrinsic::aarch64_sve_smin:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1396 case Intrinsic::aarch64_sve_smulh:
1397 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1398 case Intrinsic::aarch64_sve_sub:
1399 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1400 .setMatchingIROpcode(Instruction::Sub);
1401 case Intrinsic::aarch64_sve_uabd:
1402 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1403 case Intrinsic::aarch64_sve_udiv:
1404 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1405 .setMatchingIROpcode(Instruction::UDiv);
1406 case Intrinsic::aarch64_sve_umax:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1408 case Intrinsic::aarch64_sve_umin:
1409 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1410 case Intrinsic::aarch64_sve_umulh:
1411 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1412 case Intrinsic::aarch64_sve_asr:
1413 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1414 .setMatchingIROpcode(Instruction::AShr);
1415 case Intrinsic::aarch64_sve_lsl:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1417 .setMatchingIROpcode(Instruction::Shl);
1418 case Intrinsic::aarch64_sve_lsr:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1420 .setMatchingIROpcode(Instruction::LShr);
1421 case Intrinsic::aarch64_sve_and:
1422 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1423 .setMatchingIROpcode(Instruction::And);
1424 case Intrinsic::aarch64_sve_bic:
1425 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1426 case Intrinsic::aarch64_sve_eor:
1427 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1428 .setMatchingIROpcode(Instruction::Xor);
1429 case Intrinsic::aarch64_sve_orr:
1430 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1431 .setMatchingIROpcode(Instruction::Or);
1432 case Intrinsic::aarch64_sve_sqsub:
1433 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1434 case Intrinsic::aarch64_sve_uqsub:
1435 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1436
1437 case Intrinsic::aarch64_sve_add_u:
1439 Instruction::Add);
1440 case Intrinsic::aarch64_sve_and_u:
1442 Instruction::And);
1443 case Intrinsic::aarch64_sve_asr_u:
1445 Instruction::AShr);
1446 case Intrinsic::aarch64_sve_eor_u:
1448 Instruction::Xor);
1449 case Intrinsic::aarch64_sve_fadd_u:
1451 Instruction::FAdd);
1452 case Intrinsic::aarch64_sve_fdiv_u:
1454 Instruction::FDiv);
1455 case Intrinsic::aarch64_sve_fmul_u:
1457 Instruction::FMul);
1458 case Intrinsic::aarch64_sve_fsub_u:
1460 Instruction::FSub);
1461 case Intrinsic::aarch64_sve_lsl_u:
1463 Instruction::Shl);
1464 case Intrinsic::aarch64_sve_lsr_u:
1466 Instruction::LShr);
1467 case Intrinsic::aarch64_sve_mul_u:
1469 Instruction::Mul);
1470 case Intrinsic::aarch64_sve_orr_u:
1472 Instruction::Or);
1473 case Intrinsic::aarch64_sve_sdiv_u:
1475 Instruction::SDiv);
1476 case Intrinsic::aarch64_sve_sub_u:
1478 Instruction::Sub);
1479 case Intrinsic::aarch64_sve_udiv_u:
1481 Instruction::UDiv);
1482
1483 case Intrinsic::aarch64_sve_addqv:
1484 case Intrinsic::aarch64_sve_and_z:
1485 case Intrinsic::aarch64_sve_bic_z:
1486 case Intrinsic::aarch64_sve_brka_z:
1487 case Intrinsic::aarch64_sve_brkb_z:
1488 case Intrinsic::aarch64_sve_brkn_z:
1489 case Intrinsic::aarch64_sve_brkpa_z:
1490 case Intrinsic::aarch64_sve_brkpb_z:
1491 case Intrinsic::aarch64_sve_cntp:
1492 case Intrinsic::aarch64_sve_compact:
1493 case Intrinsic::aarch64_sve_eor_z:
1494 case Intrinsic::aarch64_sve_eorv:
1495 case Intrinsic::aarch64_sve_eorqv:
1496 case Intrinsic::aarch64_sve_nand_z:
1497 case Intrinsic::aarch64_sve_nor_z:
1498 case Intrinsic::aarch64_sve_orn_z:
1499 case Intrinsic::aarch64_sve_orr_z:
1500 case Intrinsic::aarch64_sve_orv:
1501 case Intrinsic::aarch64_sve_orqv:
1502 case Intrinsic::aarch64_sve_pnext:
1503 case Intrinsic::aarch64_sve_rdffr_z:
1504 case Intrinsic::aarch64_sve_saddv:
1505 case Intrinsic::aarch64_sve_uaddv:
1506 case Intrinsic::aarch64_sve_umaxv:
1507 case Intrinsic::aarch64_sve_umaxqv:
1508 case Intrinsic::aarch64_sve_cmpeq:
1509 case Intrinsic::aarch64_sve_cmpeq_wide:
1510 case Intrinsic::aarch64_sve_cmpge:
1511 case Intrinsic::aarch64_sve_cmpge_wide:
1512 case Intrinsic::aarch64_sve_cmpgt:
1513 case Intrinsic::aarch64_sve_cmpgt_wide:
1514 case Intrinsic::aarch64_sve_cmphi:
1515 case Intrinsic::aarch64_sve_cmphi_wide:
1516 case Intrinsic::aarch64_sve_cmphs:
1517 case Intrinsic::aarch64_sve_cmphs_wide:
1518 case Intrinsic::aarch64_sve_cmple_wide:
1519 case Intrinsic::aarch64_sve_cmplo_wide:
1520 case Intrinsic::aarch64_sve_cmpls_wide:
1521 case Intrinsic::aarch64_sve_cmplt_wide:
1522 case Intrinsic::aarch64_sve_cmpne:
1523 case Intrinsic::aarch64_sve_cmpne_wide:
1524 case Intrinsic::aarch64_sve_facge:
1525 case Intrinsic::aarch64_sve_facgt:
1526 case Intrinsic::aarch64_sve_fcmpeq:
1527 case Intrinsic::aarch64_sve_fcmpge:
1528 case Intrinsic::aarch64_sve_fcmpgt:
1529 case Intrinsic::aarch64_sve_fcmpne:
1530 case Intrinsic::aarch64_sve_fcmpuo:
1531 case Intrinsic::aarch64_sve_ld1:
1532 case Intrinsic::aarch64_sve_ld1_gather:
1533 case Intrinsic::aarch64_sve_ld1_gather_index:
1534 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1535 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1536 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1537 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1538 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1539 case Intrinsic::aarch64_sve_ld1q_gather_index:
1540 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1541 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1542 case Intrinsic::aarch64_sve_ld1ro:
1543 case Intrinsic::aarch64_sve_ld1rq:
1544 case Intrinsic::aarch64_sve_ld1udq:
1545 case Intrinsic::aarch64_sve_ld1uwq:
1546 case Intrinsic::aarch64_sve_ld2_sret:
1547 case Intrinsic::aarch64_sve_ld2q_sret:
1548 case Intrinsic::aarch64_sve_ld3_sret:
1549 case Intrinsic::aarch64_sve_ld3q_sret:
1550 case Intrinsic::aarch64_sve_ld4_sret:
1551 case Intrinsic::aarch64_sve_ld4q_sret:
1552 case Intrinsic::aarch64_sve_ldff1:
1553 case Intrinsic::aarch64_sve_ldff1_gather:
1554 case Intrinsic::aarch64_sve_ldff1_gather_index:
1555 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1556 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1557 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1558 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1559 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1560 case Intrinsic::aarch64_sve_ldnf1:
1561 case Intrinsic::aarch64_sve_ldnt1:
1562 case Intrinsic::aarch64_sve_ldnt1_gather:
1563 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1564 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1565 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1567
1568 case Intrinsic::aarch64_sve_prf:
1569 case Intrinsic::aarch64_sve_prfb_gather_index:
1570 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1571 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1572 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1573 case Intrinsic::aarch64_sve_prfd_gather_index:
1574 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1575 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1576 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1577 case Intrinsic::aarch64_sve_prfh_gather_index:
1578 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1579 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1580 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1581 case Intrinsic::aarch64_sve_prfw_gather_index:
1582 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1583 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1584 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1586
1587 case Intrinsic::aarch64_sve_st1_scatter:
1588 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1589 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1590 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1591 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1592 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1593 case Intrinsic::aarch64_sve_st1dq:
1594 case Intrinsic::aarch64_sve_st1q_scatter_index:
1595 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1596 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1597 case Intrinsic::aarch64_sve_st1wq:
1598 case Intrinsic::aarch64_sve_stnt1:
1599 case Intrinsic::aarch64_sve_stnt1_scatter:
1600 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1601 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1602 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1604 case Intrinsic::aarch64_sve_st2:
1605 case Intrinsic::aarch64_sve_st2q:
1607 case Intrinsic::aarch64_sve_st3:
1608 case Intrinsic::aarch64_sve_st3q:
1610 case Intrinsic::aarch64_sve_st4:
1611 case Intrinsic::aarch64_sve_st4q:
1613 }
1614
1615 return SVEIntrinsicInfo();
1616}
1617
1618static bool isAllActivePredicate(Value *Pred) {
1619 Value *UncastedPred;
1620
1621 // Look through predicate casts that only remove lanes.
1623 m_Value(UncastedPred)))) {
1624 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1625 Pred = UncastedPred;
1626
1628 m_Value(UncastedPred))))
1629      // If the predicate has the same number of lanes as, or fewer lanes than,
1630      // the uncasted predicate, then we know the cast has no effect.
1631 if (OrigPredTy->getMinNumElements() <=
1632 cast<ScalableVectorType>(UncastedPred->getType())
1633 ->getMinNumElements())
1634 Pred = UncastedPred;
1635 }
1636
1637 auto *C = dyn_cast<Constant>(Pred);
1638 return C && C->isAllOnesValue();
1639}
1640
1641// Simplify `V` by only considering the operations that affect active lanes.
1642// This function should only return existing Values or newly created Constants.
1643static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1644 auto *Dup = dyn_cast<IntrinsicInst>(V);
1645 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1646 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1648 cast<VectorType>(V->getType())->getElementCount(),
1649 cast<Constant>(Dup->getOperand(2)));
1650
1651 return V;
1652}
1653
1654static std::optional<Instruction *>
1656 const SVEIntrinsicInfo &IInfo) {
1657 const unsigned Opc = IInfo.getMatchingIROpode();
1658 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1659
1660 Value *Pg = II.getOperand(0);
1661 Value *Op1 = II.getOperand(1);
1662 Value *Op2 = II.getOperand(2);
1663 const DataLayout &DL = II.getDataLayout();
1664
1665 // Canonicalise constants to the RHS.
1667 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1668 IC.replaceOperand(II, 1, Op2);
1669 IC.replaceOperand(II, 2, Op1);
1670 return &II;
1671 }
1672
1673 // Only active lanes matter when simplifying the operation.
1674 Op1 = stripInactiveLanes(Op1, Pg);
1675 Op2 = stripInactiveLanes(Op2, Pg);
1676
1677 Value *SimpleII;
1678 if (auto FII = dyn_cast<FPMathOperator>(&II))
1679 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1680 else
1681 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1682
1683 // An SVE intrinsic's result is always defined. However, this is not the case
1684  // for its equivalent IR instruction (e.g. when shifting by an amount equal to
1685  // or greater than the data's bit width). Simplifications to an undefined result must be
1686 // ignored to preserve the intrinsic's expected behaviour.
1687 if (!SimpleII || isa<UndefValue>(SimpleII))
1688 return std::nullopt;
1689
1690 if (IInfo.inactiveLanesAreNotDefined())
1691 return IC.replaceInstUsesWith(II, SimpleII);
1692
1693 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1694
1695 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1696 if (SimpleII == Inactive)
1697 return IC.replaceInstUsesWith(II, SimpleII);
1698
1699 // Inactive lanes must be preserved.
1700 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1701 return IC.replaceInstUsesWith(II, SimpleII);
1702}
1703
1704// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1705// to operations with less strict inactive lane requirements.
1706static std::optional<Instruction *>
1708 const SVEIntrinsicInfo &IInfo) {
1709 if (!IInfo.hasGoverningPredicate())
1710 return std::nullopt;
1711
1712 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1713
1714 // If there are no active lanes.
1715 if (match(OpPredicate, m_ZeroInt())) {
1717 return IC.replaceInstUsesWith(
1718 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1719
1720 if (IInfo.inactiveLanesAreUnused()) {
1721 if (IInfo.resultIsZeroInitialized())
1723
1724 return IC.eraseInstFromFunction(II);
1725 }
1726 }
1727
1728 // If there are no inactive lanes.
1729 if (isAllActivePredicate(OpPredicate)) {
1730 if (IInfo.hasOperandWithNoActiveLanes()) {
1731 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1732 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1733 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1734 }
1735
1736 if (IInfo.hasMatchingUndefIntrinsic()) {
1737 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1738 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1739 II.setCalledFunction(NewDecl);
1740 return &II;
1741 }
1742 }
1743
1744 // Operation specific simplifications.
1745 if (IInfo.hasMatchingIROpode() &&
1747 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1748
1749 return std::nullopt;
1750}
1751
1752 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1753// => (binop (pred) (from_svbool _) (from_svbool _))
1754//
1755// The above transformation eliminates a `to_svbool` in the predicate
1756// operand of bitwise operation `binop` by narrowing the vector width of
1757// the operation. For example, it would convert a `<vscale x 16 x i1>
1758// and` into a `<vscale x 4 x i1> and`. This is profitable because
1759// to_svbool must zero the new lanes during widening, whereas
1760// from_svbool is free.
1761static std::optional<Instruction *>
1763 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1764 if (!BinOp)
1765 return std::nullopt;
1766
1767 auto IntrinsicID = BinOp->getIntrinsicID();
1768 switch (IntrinsicID) {
1769 case Intrinsic::aarch64_sve_and_z:
1770 case Intrinsic::aarch64_sve_bic_z:
1771 case Intrinsic::aarch64_sve_eor_z:
1772 case Intrinsic::aarch64_sve_nand_z:
1773 case Intrinsic::aarch64_sve_nor_z:
1774 case Intrinsic::aarch64_sve_orn_z:
1775 case Intrinsic::aarch64_sve_orr_z:
1776 break;
1777 default:
1778 return std::nullopt;
1779 }
1780
1781 auto BinOpPred = BinOp->getOperand(0);
1782 auto BinOpOp1 = BinOp->getOperand(1);
1783 auto BinOpOp2 = BinOp->getOperand(2);
1784
1785 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1786 if (!PredIntr ||
1787 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1788 return std::nullopt;
1789
1790 auto PredOp = PredIntr->getOperand(0);
1791 auto PredOpTy = cast<VectorType>(PredOp->getType());
1792 if (PredOpTy != II.getType())
1793 return std::nullopt;
1794
1795 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1796 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1797 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1798 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1799 if (BinOpOp1 == BinOpOp2)
1800 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1801 else
1802 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1803 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1804
1805 auto NarrowedBinOp =
1806 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1807 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1808}
1809
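// Look through chains of convert.to.svbool / convert.from.svbool and reuse the
// earliest value in the chain that already has the required type; the
// intermediate conversions are then redundant and can be bypassed.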
1810static std::optional<Instruction *>
1812 // If the reinterpret instruction operand is a PHI Node
1813 if (isa<PHINode>(II.getArgOperand(0)))
1814 return processPhiNode(IC, II);
1815
1816 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1817 return BinOpCombine;
1818
1819 // Ignore converts to/from svcount_t.
1820 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1821 isa<TargetExtType>(II.getType()))
1822 return std::nullopt;
1823
1824 SmallVector<Instruction *, 32> CandidatesForRemoval;
1825 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1826
1827 const auto *IVTy = cast<VectorType>(II.getType());
1828
1829 // Walk the chain of conversions.
1830 while (Cursor) {
1831 // If the type of the cursor has fewer lanes than the final result, zeroing
1832 // must take place, which breaks the equivalence chain.
1833 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1834 if (CursorVTy->getElementCount().getKnownMinValue() <
1835 IVTy->getElementCount().getKnownMinValue())
1836 break;
1837
1838 // If the cursor has the same type as I, it is a viable replacement.
1839 if (Cursor->getType() == IVTy)
1840 EarliestReplacement = Cursor;
1841
1842 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1843
1844 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1845 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1846 Intrinsic::aarch64_sve_convert_to_svbool ||
1847 IntrinsicCursor->getIntrinsicID() ==
1848 Intrinsic::aarch64_sve_convert_from_svbool))
1849 break;
1850
1851 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1852 Cursor = IntrinsicCursor->getOperand(0);
1853 }
1854
1855 // If no viable replacement in the conversion chain was found, there is
1856 // nothing to do.
1857 if (!EarliestReplacement)
1858 return std::nullopt;
1859
1860 return IC.replaceInstUsesWith(II, EarliestReplacement);
1861}
1862
1863static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1864 IntrinsicInst &II) {
1865 // svsel(ptrue, x, y) => x
1866 auto *OpPredicate = II.getOperand(0);
1867 if (isAllActivePredicate(OpPredicate))
1868 return IC.replaceInstUsesWith(II, II.getOperand(1));
1869
1870 auto Select =
1871 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1872 return IC.replaceInstUsesWith(II, Select);
1873}
1874
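// dup(vec, ptrue(vl1), x) only writes lane 0, so it can be replaced with
// insertelement vec, x, 0.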
1875static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1876 IntrinsicInst &II) {
1877 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1878 if (!Pg)
1879 return std::nullopt;
1880
1881 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1882 return std::nullopt;
1883
1884 const auto PTruePattern =
1885 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1886 if (PTruePattern != AArch64SVEPredPattern::vl1)
1887 return std::nullopt;
1888
1889 // The intrinsic is inserting into lane zero so use an insert instead.
1890 auto *IdxTy = Type::getInt64Ty(II.getContext());
1891 auto *Insert = InsertElementInst::Create(
1892 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1893 Insert->insertBefore(II.getIterator());
1894 Insert->takeName(&II);
1895
1896 return IC.replaceInstUsesWith(II, Insert);
1897}
1898
1899static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1900 IntrinsicInst &II) {
1901 // Replace DupX with a regular IR splat.
1902 auto *RetTy = cast<ScalableVectorType>(II.getType());
1903 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1904 II.getArgOperand(0));
1905 Splat->takeName(&II);
1906 return IC.replaceInstUsesWith(II, Splat);
1907}
1908
1909static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1910 IntrinsicInst &II) {
1911 LLVMContext &Ctx = II.getContext();
1912
1913 if (!isAllActivePredicate(II.getArgOperand(0)))
1914 return std::nullopt;
1915
1916 // Check that we have a compare of zero..
1917  auto *SplatValue =
1918      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1919 if (!SplatValue || !SplatValue->isZero())
1920 return std::nullopt;
1921
1922 // ..against a dupq
1923 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1924 if (!DupQLane ||
1925 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1926 return std::nullopt;
1927
1928 // Where the dupq is a lane 0 replicate of a vector insert
1929 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1930 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1931 return std::nullopt;
1932
1933 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1934 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1935 return std::nullopt;
1936
1937 // Where the vector insert is a fixed constant vector insert into undef at
1938 // index zero
1939 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1940 return std::nullopt;
1941
1942 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1943 return std::nullopt;
1944
1945 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1946 if (!ConstVec)
1947 return std::nullopt;
1948
1949 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1950 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1951 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1952 return std::nullopt;
1953
1954 unsigned NumElts = VecTy->getNumElements();
1955 unsigned PredicateBits = 0;
1956
1957  // Expand the intrinsic operands to a 16-bit, byte-level predicate (one bit per byte).
1958 for (unsigned I = 0; I < NumElts; ++I) {
1959 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1960 if (!Arg)
1961 return std::nullopt;
1962 if (!Arg->isZero())
1963 PredicateBits |= 1 << (I * (16 / NumElts));
1964 }
1965
1966 // If all bits are zero bail early with an empty predicate
1967 if (PredicateBits == 0) {
1968 auto *PFalse = Constant::getNullValue(II.getType());
1969 PFalse->takeName(&II);
1970 return IC.replaceInstUsesWith(II, PFalse);
1971 }
1972
1973 // Calculate largest predicate type used (where byte predicate is largest)
1974 unsigned Mask = 8;
1975 for (unsigned I = 0; I < 16; ++I)
1976 if ((PredicateBits & (1 << I)) != 0)
1977 Mask |= (I % 8);
1978
1979 unsigned PredSize = Mask & -Mask;
1980 auto *PredType = ScalableVectorType::get(
1981 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1982
1983 // Ensure all relevant bits are set
1984 for (unsigned I = 0; I < 16; I += PredSize)
1985 if ((PredicateBits & (1 << I)) == 0)
1986 return std::nullopt;
1987
1988 auto *PTruePat =
1989 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1990 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1991 {PredType}, {PTruePat});
1992 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1993 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1994 auto *ConvertFromSVBool =
1995 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1996 {II.getType()}, {ConvertToSVBool});
1997
1998 ConvertFromSVBool->takeName(&II);
1999 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2000}
2001
2002static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2003 IntrinsicInst &II) {
2004 Value *Pg = II.getArgOperand(0);
2005 Value *Vec = II.getArgOperand(1);
2006 auto IntrinsicID = II.getIntrinsicID();
2007 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2008
2009 // lastX(splat(X)) --> X
2010 if (auto *SplatVal = getSplatValue(Vec))
2011 return IC.replaceInstUsesWith(II, SplatVal);
2012
2013 // If x and/or y is a splat value then:
2014 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2015 Value *LHS, *RHS;
2016 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2017 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2018 auto *OldBinOp = cast<BinaryOperator>(Vec);
2019 auto OpC = OldBinOp->getOpcode();
2020 auto *NewLHS =
2021 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2022 auto *NewRHS =
2023 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2024      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2025          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2026 return IC.replaceInstUsesWith(II, NewBinOp);
2027 }
2028 }
2029
2030 auto *C = dyn_cast<Constant>(Pg);
2031 if (IsAfter && C && C->isNullValue()) {
2032 // The intrinsic is extracting lane 0 so use an extract instead.
2033 auto *IdxTy = Type::getInt64Ty(II.getContext());
2034 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2035 Extract->insertBefore(II.getIterator());
2036 Extract->takeName(&II);
2037 return IC.replaceInstUsesWith(II, Extract);
2038 }
2039
2040 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2041 if (!IntrPG)
2042 return std::nullopt;
2043
2044 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2045 return std::nullopt;
2046
2047 const auto PTruePattern =
2048 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2049
2050 // Can the intrinsic's predicate be converted to a known constant index?
2051 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2052 if (!MinNumElts)
2053 return std::nullopt;
2054
2055 unsigned Idx = MinNumElts - 1;
2056 // Increment the index if extracting the element after the last active
2057 // predicate element.
2058 if (IsAfter)
2059 ++Idx;
2060
2061 // Ignore extracts whose index is larger than the known minimum vector
2062 // length. NOTE: This is an artificial constraint where we prefer to
2063 // maintain what the user asked for until an alternative is proven faster.
2064 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2065 if (Idx >= PgVTy->getMinNumElements())
2066 return std::nullopt;
2067
2068 // The intrinsic is extracting a fixed lane so use an extract instead.
2069 auto *IdxTy = Type::getInt64Ty(II.getContext());
2070 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2071 Extract->insertBefore(II.getIterator());
2072 Extract->takeName(&II);
2073 return IC.replaceInstUsesWith(II, Extract);
2074}
2075
2076static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2077 IntrinsicInst &II) {
2078 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2079 // integer variant across a variety of micro-architectures. Replace scalar
2080 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2081 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2082 // depending on the micro-architecture, but has been observed as generally
2083 // being faster, particularly when the CLAST[AB] op is a loop-carried
2084 // dependency.
2085 Value *Pg = II.getArgOperand(0);
2086 Value *Fallback = II.getArgOperand(1);
2087 Value *Vec = II.getArgOperand(2);
2088 Type *Ty = II.getType();
2089
2090 if (!Ty->isIntegerTy())
2091 return std::nullopt;
2092
2093 Type *FPTy;
2094 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2095 default:
2096 return std::nullopt;
2097 case 16:
2098 FPTy = IC.Builder.getHalfTy();
2099 break;
2100 case 32:
2101 FPTy = IC.Builder.getFloatTy();
2102 break;
2103 case 64:
2104 FPTy = IC.Builder.getDoubleTy();
2105 break;
2106 }
2107
2108 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2109 auto *FPVTy = VectorType::get(
2110 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2111 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2112 auto *FPII = IC.Builder.CreateIntrinsic(
2113 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2114 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2115 return IC.replaceInstUsesWith(II, FPIItoInt);
2116}
2117
2118static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2119 IntrinsicInst &II) {
2120 LLVMContext &Ctx = II.getContext();
2121 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2122 // can work with RDFFR_PP for ptest elimination.
2123 auto *AllPat =
2124 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2125 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2126 {II.getType()}, {AllPat});
2127 auto *RDFFR =
2128 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2129 RDFFR->takeName(&II);
2130 return IC.replaceInstUsesWith(II, RDFFR);
2131}
2132
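// Fold the element-count intrinsics (cntb/cnth/cntw/cntd). The "all" pattern
// becomes a vscale-based element count; other patterns become a constant when
// the requested count cannot exceed the minimum number of elements per vector.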
2133static std::optional<Instruction *>
2135 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2136
2137 if (Pattern == AArch64SVEPredPattern::all) {
2139 II.getType(), ElementCount::getScalable(NumElts));
2140 Cnt->takeName(&II);
2141 return IC.replaceInstUsesWith(II, Cnt);
2142 }
2143
2144 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2145
2146 return MinNumElts && NumElts >= MinNumElts
2147 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2148 II, ConstantInt::get(II.getType(), MinNumElts)))
2149 : std::nullopt;
2150}
2151
2152static std::optional<Instruction *>
2154 const AArch64Subtarget *ST) {
2155 if (!ST->isStreaming())
2156 return std::nullopt;
2157
2158 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2159 // with SVEPredPattern::all
2160 Value *Cnt =
2162 Cnt->takeName(&II);
2163 return IC.replaceInstUsesWith(II, Cnt);
2164}
2165
2166static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2167 IntrinsicInst &II) {
2168 Value *PgVal = II.getArgOperand(0);
2169 Value *OpVal = II.getArgOperand(1);
2170
2171 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2172 // Later optimizations prefer this form.
2173 if (PgVal == OpVal &&
2174 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2175 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2176 Value *Ops[] = {PgVal, OpVal};
2177 Type *Tys[] = {PgVal->getType()};
2178
2179 auto *PTest =
2180 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2181 PTest->takeName(&II);
2182
2183 return IC.replaceInstUsesWith(II, PTest);
2184 }
2185
2186  auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2187  auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2188
2189 if (!Pg || !Op)
2190 return std::nullopt;
2191
2192 Intrinsic::ID OpIID = Op->getIntrinsicID();
2193
2194 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2195 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2196 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2197 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2198 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2199
2200 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2201
2202 PTest->takeName(&II);
2203 return IC.replaceInstUsesWith(II, PTest);
2204 }
2205
2206  // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2207  // Later optimizations may rewrite the sequence to use the flag-setting variant
2208 // of instruction X to remove PTEST.
2209 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2210 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2211 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2212 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2213 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2214 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2215 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2216 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2217 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2218 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2219 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2220 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2221 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2222 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2223 Type *Tys[] = {Pg->getType()};
2224
2225 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2226 PTest->takeName(&II);
2227
2228 return IC.replaceInstUsesWith(II, PTest);
2229 }
2230
2231 return std::nullopt;
2232}
2233
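// Fold a multiply intrinsic (MulOpc) that feeds a predicated add/sub into the
// fused multiply-add/sub intrinsic FuseOpc. The multiply must be governed by
// the same predicate, have a single use and, for floating-point types, its
// fast-math flags must match the add/sub's and allow contraction.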
2234template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2235static std::optional<Instruction *>
2237 bool MergeIntoAddendOp) {
2238 Value *P = II.getOperand(0);
2239 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2240 if (MergeIntoAddendOp) {
2241 AddendOp = II.getOperand(1);
2242 Mul = II.getOperand(2);
2243 } else {
2244 AddendOp = II.getOperand(2);
2245 Mul = II.getOperand(1);
2246 }
2247
2248  if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2249                                      m_Value(MulOp1))))
2250 return std::nullopt;
2251
2252 if (!Mul->hasOneUse())
2253 return std::nullopt;
2254
2255 Instruction *FMFSource = nullptr;
2256 if (II.getType()->isFPOrFPVectorTy()) {
2257 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2258 // Stop the combine when the flags on the inputs differ in case dropping
2259 // flags would lead to us missing out on more beneficial optimizations.
2260 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2261 return std::nullopt;
2262 if (!FAddFlags.allowContract())
2263 return std::nullopt;
2264 FMFSource = &II;
2265 }
2266
2267 CallInst *Res;
2268 if (MergeIntoAddendOp)
2269 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2270 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2271 else
2272 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2273 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2274
2275 return IC.replaceInstUsesWith(II, Res);
2276}
2277
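// ld1(pg, ptr) --> masked.load(ptr, align, pg, zeroinitializer), or a plain
// load when pg is all active.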
2278static std::optional<Instruction *>
2280 Value *Pred = II.getOperand(0);
2281 Value *PtrOp = II.getOperand(1);
2282 Type *VecTy = II.getType();
2283
2284 if (isAllActivePredicate(Pred)) {
2285 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2286 Load->copyMetadata(II);
2287 return IC.replaceInstUsesWith(II, Load);
2288 }
2289
2290 CallInst *MaskedLoad =
2291 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2292 Pred, ConstantAggregateZero::get(VecTy));
2293 MaskedLoad->copyMetadata(II);
2294 return IC.replaceInstUsesWith(II, MaskedLoad);
2295}
2296
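// st1(val, pg, ptr) --> masked.store(val, ptr, align, pg), or a plain store
// when pg is all active.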
2297static std::optional<Instruction *>
2299 Value *VecOp = II.getOperand(0);
2300 Value *Pred = II.getOperand(1);
2301 Value *PtrOp = II.getOperand(2);
2302
2303 if (isAllActivePredicate(Pred)) {
2304 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2305 Store->copyMetadata(II);
2306 return IC.eraseInstFromFunction(II);
2307 }
2308
2309 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2310 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2311 MaskedStore->copyMetadata(II);
2312 return IC.eraseInstFromFunction(II);
2313}
2314
2315 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2316  switch (Intrinsic) {
2317 case Intrinsic::aarch64_sve_fmul_u:
2318 return Instruction::BinaryOps::FMul;
2319 case Intrinsic::aarch64_sve_fadd_u:
2320 return Instruction::BinaryOps::FAdd;
2321 case Intrinsic::aarch64_sve_fsub_u:
2322 return Instruction::BinaryOps::FSub;
2323 default:
2324 return Instruction::BinaryOpsEnd;
2325 }
2326}
2327
2328static std::optional<Instruction *>
2330 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2331 if (II.isStrictFP())
2332 return std::nullopt;
2333
2334 auto *OpPredicate = II.getOperand(0);
2335 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2336 if (BinOpCode == Instruction::BinaryOpsEnd ||
2337 !isAllActivePredicate(OpPredicate))
2338 return std::nullopt;
2339 auto BinOp = IC.Builder.CreateBinOpFMF(
2340 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2341 return IC.replaceInstUsesWith(II, BinOp);
2342}
2343
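// add(pg, a, mul(pg, b, c)) --> mla(pg, a, b, c)
// add(pg, mul(pg, a, b), c) --> mad(pg, a, b, c)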
2344static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2345 IntrinsicInst &II) {
2346 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2347 Intrinsic::aarch64_sve_mla>(
2348 IC, II, true))
2349 return MLA;
2350 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2351 Intrinsic::aarch64_sve_mad>(
2352 IC, II, false))
2353 return MAD;
2354 return std::nullopt;
2355}
2356
2357static std::optional<Instruction *>
2359 if (auto FMLA =
2360 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2361 Intrinsic::aarch64_sve_fmla>(IC, II,
2362 true))
2363 return FMLA;
2364 if (auto FMAD =
2365 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2366 Intrinsic::aarch64_sve_fmad>(IC, II,
2367 false))
2368 return FMAD;
2369 if (auto FMLA =
2370 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2371 Intrinsic::aarch64_sve_fmla>(IC, II,
2372 true))
2373 return FMLA;
2374 return std::nullopt;
2375}
2376
2377static std::optional<Instruction *>
2379 if (auto FMLA =
2380 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2381 Intrinsic::aarch64_sve_fmla>(IC, II,
2382 true))
2383 return FMLA;
2384 if (auto FMAD =
2385 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2386 Intrinsic::aarch64_sve_fmad>(IC, II,
2387 false))
2388 return FMAD;
2389 if (auto FMLA_U =
2390 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2391 Intrinsic::aarch64_sve_fmla_u>(
2392 IC, II, true))
2393 return FMLA_U;
2394 return instCombineSVEVectorBinOp(IC, II);
2395}
2396
2397static std::optional<Instruction *>
2399 if (auto FMLS =
2400 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2401 Intrinsic::aarch64_sve_fmls>(IC, II,
2402 true))
2403 return FMLS;
2404 if (auto FMSB =
2405 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2406 Intrinsic::aarch64_sve_fnmsb>(
2407 IC, II, false))
2408 return FMSB;
2409 if (auto FMLS =
2410 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2411 Intrinsic::aarch64_sve_fmls>(IC, II,
2412 true))
2413 return FMLS;
2414 return std::nullopt;
2415}
2416
2417static std::optional<Instruction *>
2419 if (auto FMLS =
2420 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2421 Intrinsic::aarch64_sve_fmls>(IC, II,
2422 true))
2423 return FMLS;
2424 if (auto FMSB =
2425 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2426 Intrinsic::aarch64_sve_fnmsb>(
2427 IC, II, false))
2428 return FMSB;
2429 if (auto FMLS_U =
2430 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2431 Intrinsic::aarch64_sve_fmls_u>(
2432 IC, II, true))
2433 return FMLS_U;
2434 return instCombineSVEVectorBinOp(IC, II);
2435}
2436
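// sub(pg, a, mul(pg, b, c)) --> mls(pg, a, b, c)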
2437static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2438 IntrinsicInst &II) {
2439 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2440 Intrinsic::aarch64_sve_mls>(
2441 IC, II, true))
2442 return MLS;
2443 return std::nullopt;
2444}
2445
2446static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2447 IntrinsicInst &II) {
2448 Value *UnpackArg = II.getArgOperand(0);
2449 auto *RetTy = cast<ScalableVectorType>(II.getType());
2450 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2451 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2452
2453 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2454 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2455 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2456 ScalarArg =
2457 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2458 Value *NewVal =
2459 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2460 NewVal->takeName(&II);
2461 return IC.replaceInstUsesWith(II, NewVal);
2462 }
2463
2464 return std::nullopt;
2465}
2466static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2467 IntrinsicInst &II) {
2468 auto *OpVal = II.getOperand(0);
2469 auto *OpIndices = II.getOperand(1);
2470 VectorType *VTy = cast<VectorType>(II.getType());
2471
2472  // Check whether OpIndices is a constant splat value less than the minimum
2473  // element count of the result.
2474 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2475 if (!SplatValue ||
2476 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2477 return std::nullopt;
2478
2479  // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
2480 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2481 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2482 auto *VectorSplat =
2483 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2484
2485 VectorSplat->takeName(&II);
2486 return IC.replaceInstUsesWith(II, VectorSplat);
2487}
2488
2489static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2490 IntrinsicInst &II) {
2491 Value *A, *B;
2492 Type *RetTy = II.getType();
2493 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2494 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2495
2496 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2497 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2498 if ((match(II.getArgOperand(0),
2500 match(II.getArgOperand(1),
2502 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2503 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2504 auto *TyA = cast<ScalableVectorType>(A->getType());
2505 if (TyA == B->getType() &&
2507 auto *SubVec = IC.Builder.CreateInsertVector(
2508 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2509 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2510 TyA->getMinNumElements());
2511 ConcatVec->takeName(&II);
2512 return IC.replaceInstUsesWith(II, ConcatVec);
2513 }
2514 }
2515
2516 return std::nullopt;
2517}
2518
2519static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2520 IntrinsicInst &II) {
2521 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2522 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2523 Value *A, *B;
2524 if (match(II.getArgOperand(0),
2527 m_Specific(A), m_Specific(B))))
2528 return IC.replaceInstUsesWith(
2529 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2530
2531 return std::nullopt;
2532}
2533
2534static std::optional<Instruction *>
2536 Value *Mask = II.getOperand(0);
2537 Value *BasePtr = II.getOperand(1);
2538 Value *Index = II.getOperand(2);
2539 Type *Ty = II.getType();
2540 Value *PassThru = ConstantAggregateZero::get(Ty);
2541
2542 // Contiguous gather => masked load.
2543 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2544 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2545 Value *IndexBase;
2547 m_Value(IndexBase), m_SpecificInt(1)))) {
2548 Align Alignment =
2549 BasePtr->getPointerAlignment(II.getDataLayout());
2550
2551 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2552 BasePtr, IndexBase);
2553 CallInst *MaskedLoad =
2554 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2555 MaskedLoad->takeName(&II);
2556 return IC.replaceInstUsesWith(II, MaskedLoad);
2557 }
2558
2559 return std::nullopt;
2560}
2561
2562static std::optional<Instruction *>
2564 Value *Val = II.getOperand(0);
2565 Value *Mask = II.getOperand(1);
2566 Value *BasePtr = II.getOperand(2);
2567 Value *Index = II.getOperand(3);
2568 Type *Ty = Val->getType();
2569
2570 // Contiguous scatter => masked store.
2571 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2572 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2573 Value *IndexBase;
2575 m_Value(IndexBase), m_SpecificInt(1)))) {
2576 Align Alignment =
2577 BasePtr->getPointerAlignment(II.getDataLayout());
2578
2579 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2580 BasePtr, IndexBase);
2581 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2582
2583 return IC.eraseInstFromFunction(II);
2584 }
2585
2586 return std::nullopt;
2587}
2588
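// Fold a signed divide by a constant power-of-two splat into an arithmetic
// shift, e.g. sdiv(pg, x, splat(8)) --> asrd(pg, x, 3) and
// sdiv(pg, x, splat(-8)) --> neg(pg, asrd(pg, x, 3)).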
2589static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2590 IntrinsicInst &II) {
2591  Type *Int32Ty = IC.Builder.getInt32Ty();
2592  Value *Pred = II.getOperand(0);
2593 Value *Vec = II.getOperand(1);
2594 Value *DivVec = II.getOperand(2);
2595
2596 Value *SplatValue = getSplatValue(DivVec);
2597 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2598 if (!SplatConstantInt)
2599 return std::nullopt;
2600
2601 APInt Divisor = SplatConstantInt->getValue();
2602 const int64_t DivisorValue = Divisor.getSExtValue();
2603 if (DivisorValue == -1)
2604 return std::nullopt;
2605 if (DivisorValue == 1)
2606 IC.replaceInstUsesWith(II, Vec);
2607
2608 if (Divisor.isPowerOf2()) {
2609 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2610 auto ASRD = IC.Builder.CreateIntrinsic(
2611 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2612 return IC.replaceInstUsesWith(II, ASRD);
2613 }
2614 if (Divisor.isNegatedPowerOf2()) {
2615 Divisor.negate();
2616 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2617 auto ASRD = IC.Builder.CreateIntrinsic(
2618 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2619 auto NEG = IC.Builder.CreateIntrinsic(
2620 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2621 return IC.replaceInstUsesWith(II, NEG);
2622 }
2623
2624 return std::nullopt;
2625}
2626
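// Returns true when Vec is a power-of-two repetition of its first half (e.g.
// (a, b, a, b)), shrinking Vec down to the repeated prefix. When AllowPoison
// is set, missing (nullptr) elements may pair with any value.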
2627bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2628 size_t VecSize = Vec.size();
2629 if (VecSize == 1)
2630 return true;
2631 if (!isPowerOf2_64(VecSize))
2632 return false;
2633 size_t HalfVecSize = VecSize / 2;
2634
2635 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2636 RHS != Vec.end(); LHS++, RHS++) {
2637 if (*LHS != nullptr && *RHS != nullptr) {
2638 if (*LHS == *RHS)
2639 continue;
2640 else
2641 return false;
2642 }
2643 if (!AllowPoison)
2644 return false;
2645 if (*LHS == nullptr && *RHS != nullptr)
2646 *LHS = *RHS;
2647 }
2648
2649 Vec.resize(HalfVecSize);
2650 SimplifyValuePattern(Vec, AllowPoison);
2651 return true;
2652}
2653
2654// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2655// to dupqlane(f64(C)) where C is A concatenated with B
2656static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2657 IntrinsicInst &II) {
2658 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2659 if (!match(II.getOperand(0),
2661 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2662 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2663 return std::nullopt;
2664 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2665
2666 // Insert the scalars into a container ordered by InsertElement index
2667 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2668 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2669 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2670 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2671 CurrentInsertElt = InsertElt->getOperand(0);
2672 }
2673
2674 bool AllowPoison =
2675 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2676 if (!SimplifyValuePattern(Elts, AllowPoison))
2677 return std::nullopt;
2678
2679 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2680 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2681 for (size_t I = 0; I < Elts.size(); I++) {
2682 if (Elts[I] == nullptr)
2683 continue;
2684 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2685 IC.Builder.getInt64(I));
2686 }
2687 if (InsertEltChain == nullptr)
2688 return std::nullopt;
2689
2690 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2691 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2692 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2693 // be narrowed back to the original type.
2694 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2695 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2696 IIScalableTy->getMinNumElements() /
2697 PatternWidth;
2698
2699 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2700 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2701 auto *WideShuffleMaskTy =
2702 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2703
2704 auto InsertSubvector = IC.Builder.CreateInsertVector(
2705 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2706 uint64_t(0));
2707 auto WideBitcast =
2708 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2709 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2710 auto WideShuffle = IC.Builder.CreateShuffleVector(
2711 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2712 auto NarrowBitcast =
2713 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2714
2715 return IC.replaceInstUsesWith(II, NarrowBitcast);
2716}
2717
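// fmaxnm(x, x) --> x and fminnm(x, x) --> x.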
2718static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2719 IntrinsicInst &II) {
2720 Value *A = II.getArgOperand(0);
2721 Value *B = II.getArgOperand(1);
2722 if (A == B)
2723 return IC.replaceInstUsesWith(II, A);
2724
2725 return std::nullopt;
2726}
2727
2728static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2729 IntrinsicInst &II) {
2730 Value *Pred = II.getOperand(0);
2731 Value *Vec = II.getOperand(1);
2732 Value *Shift = II.getOperand(2);
2733
2734 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2735 Value *AbsPred, *MergedValue;
2737 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2739 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2740
2741 return std::nullopt;
2742
2743  // The transform is valid if any of the following are true:
2744  // * The ABS merge value is undef or non-negative
2745 // * The ABS predicate is all active
2746 // * The ABS predicate and the SRSHL predicates are the same
2747 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2748 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2749 return std::nullopt;
2750
2751 // Only valid when the shift amount is non-negative, otherwise the rounding
2752 // behaviour of SRSHL cannot be ignored.
2753 if (!match(Shift, m_NonNegative()))
2754 return std::nullopt;
2755
2756 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2757 {II.getType()}, {Pred, Vec, Shift});
2758
2759 return IC.replaceInstUsesWith(II, LSL);
2760}
2761
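// insr(splat(x), x) --> splat(x), as inserting the value that is already
// splatted changes nothing.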
2762static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2763 IntrinsicInst &II) {
2764 Value *Vec = II.getOperand(0);
2765
2766 if (getSplatValue(Vec) == II.getOperand(1))
2767 return IC.replaceInstUsesWith(II, Vec);
2768
2769 return std::nullopt;
2770}
2771
2772static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2773 IntrinsicInst &II) {
2774  // If this barrier is post-dominated by an identical one, we can remove it.
2775 auto *NI = II.getNextNode();
2776 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2777 auto CanSkipOver = [](Instruction *I) {
2778 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2779 };
2780 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2781 auto *NIBB = NI->getParent();
2782 NI = NI->getNextNode();
2783 if (!NI) {
2784 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2785 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2786 else
2787 break;
2788 }
2789 }
2790 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2791 if (NextII && II.isIdenticalTo(NextII))
2792 return IC.eraseInstFromFunction(II);
2793
2794 return std::nullopt;
2795}
2796
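// whilelo(a, b) is equivalent to the target-independent
// llvm.get.active.lane.mask(a, b).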
2797static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2798 IntrinsicInst &II) {
2799 return IC.replaceInstUsesWith(
2800 II,
2801 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2802 {II.getType(), II.getOperand(0)->getType()},
2803 {II.getOperand(0), II.getOperand(1)}));
2804}
2805
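// A ptrue whose pattern is known to enable every lane folds to an all-true
// predicate constant.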
2806static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2807 IntrinsicInst &II) {
2809 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2810 return std::nullopt;
2811}
2812
2813static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2815 unsigned NumBits) {
2816 Value *Passthru = II.getOperand(0);
2817 Value *Pg = II.getOperand(1);
2818 Value *Op = II.getOperand(2);
2819
2820 // Convert UXT[BHW] to AND.
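  // e.g. for uxtb the mask is the low 8 bits, so
  // uxtb(passthru, pg, x) --> and_u(pg, x, splat(0xff))
  // whenever the passthru is undef or pg is all active.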
2821 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2822 auto *Ty = cast<VectorType>(II.getType());
2823 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2824 auto *Mask = ConstantInt::get(Ty, MaskValue);
2825 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2826 {Pg, Op, Mask});
2827 return IC.replaceInstUsesWith(II, And);
2828 }
2829
2830 return std::nullopt;
2831}
2832
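// The result of llvm.aarch64.sme.in.streaming.mode is known at compile time
// for functions with a streaming interface or body (always true) and for
// functions without a streaming-compatible interface (always false); only
// streaming-compatible functions need the runtime check.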
2833static std::optional<Instruction *>
2835 SMEAttrs FnSMEAttrs(*II.getFunction());
2836 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2837 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2838 return IC.replaceInstUsesWith(
2839 II, ConstantInt::getBool(II.getType(), IsStreaming));
2840 return std::nullopt;
2841}
2842
2843std::optional<Instruction *>
2845 IntrinsicInst &II) const {
2847 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2848 return I;
2849
2850 Intrinsic::ID IID = II.getIntrinsicID();
2851 switch (IID) {
2852 default:
2853 break;
2854 case Intrinsic::aarch64_dmb:
2855 return instCombineDMB(IC, II);
2856 case Intrinsic::aarch64_neon_fmaxnm:
2857 case Intrinsic::aarch64_neon_fminnm:
2858 return instCombineMaxMinNM(IC, II);
2859 case Intrinsic::aarch64_sve_convert_from_svbool:
2860 return instCombineConvertFromSVBool(IC, II);
2861 case Intrinsic::aarch64_sve_dup:
2862 return instCombineSVEDup(IC, II);
2863 case Intrinsic::aarch64_sve_dup_x:
2864 return instCombineSVEDupX(IC, II);
2865 case Intrinsic::aarch64_sve_cmpne:
2866 case Intrinsic::aarch64_sve_cmpne_wide:
2867 return instCombineSVECmpNE(IC, II);
2868 case Intrinsic::aarch64_sve_rdffr:
2869 return instCombineRDFFR(IC, II);
2870 case Intrinsic::aarch64_sve_lasta:
2871 case Intrinsic::aarch64_sve_lastb:
2872 return instCombineSVELast(IC, II);
2873 case Intrinsic::aarch64_sve_clasta_n:
2874 case Intrinsic::aarch64_sve_clastb_n:
2875 return instCombineSVECondLast(IC, II);
2876 case Intrinsic::aarch64_sve_cntd:
2877 return instCombineSVECntElts(IC, II, 2);
2878 case Intrinsic::aarch64_sve_cntw:
2879 return instCombineSVECntElts(IC, II, 4);
2880 case Intrinsic::aarch64_sve_cnth:
2881 return instCombineSVECntElts(IC, II, 8);
2882 case Intrinsic::aarch64_sve_cntb:
2883 return instCombineSVECntElts(IC, II, 16);
2884 case Intrinsic::aarch64_sme_cntsd:
2885 return instCombineSMECntsd(IC, II, ST);
2886 case Intrinsic::aarch64_sve_ptest_any:
2887 case Intrinsic::aarch64_sve_ptest_first:
2888 case Intrinsic::aarch64_sve_ptest_last:
2889 return instCombineSVEPTest(IC, II);
2890 case Intrinsic::aarch64_sve_fadd:
2891 return instCombineSVEVectorFAdd(IC, II);
2892 case Intrinsic::aarch64_sve_fadd_u:
2893 return instCombineSVEVectorFAddU(IC, II);
2894 case Intrinsic::aarch64_sve_fmul_u:
2895 return instCombineSVEVectorBinOp(IC, II);
2896 case Intrinsic::aarch64_sve_fsub:
2897 return instCombineSVEVectorFSub(IC, II);
2898 case Intrinsic::aarch64_sve_fsub_u:
2899 return instCombineSVEVectorFSubU(IC, II);
2900 case Intrinsic::aarch64_sve_add:
2901 return instCombineSVEVectorAdd(IC, II);
2902 case Intrinsic::aarch64_sve_add_u:
2903 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2904 Intrinsic::aarch64_sve_mla_u>(
2905 IC, II, true);
2906 case Intrinsic::aarch64_sve_sub:
2907 return instCombineSVEVectorSub(IC, II);
2908 case Intrinsic::aarch64_sve_sub_u:
2909 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2910 Intrinsic::aarch64_sve_mls_u>(
2911 IC, II, true);
2912 case Intrinsic::aarch64_sve_tbl:
2913 return instCombineSVETBL(IC, II);
2914 case Intrinsic::aarch64_sve_uunpkhi:
2915 case Intrinsic::aarch64_sve_uunpklo:
2916 case Intrinsic::aarch64_sve_sunpkhi:
2917 case Intrinsic::aarch64_sve_sunpklo:
2918 return instCombineSVEUnpack(IC, II);
2919 case Intrinsic::aarch64_sve_uzp1:
2920 return instCombineSVEUzp1(IC, II);
2921 case Intrinsic::aarch64_sve_zip1:
2922 case Intrinsic::aarch64_sve_zip2:
2923 return instCombineSVEZip(IC, II);
2924 case Intrinsic::aarch64_sve_ld1_gather_index:
2925 return instCombineLD1GatherIndex(IC, II);
2926 case Intrinsic::aarch64_sve_st1_scatter_index:
2927 return instCombineST1ScatterIndex(IC, II);
2928 case Intrinsic::aarch64_sve_ld1:
2929 return instCombineSVELD1(IC, II, DL);
2930 case Intrinsic::aarch64_sve_st1:
2931 return instCombineSVEST1(IC, II, DL);
2932 case Intrinsic::aarch64_sve_sdiv:
2933 return instCombineSVESDIV(IC, II);
2934 case Intrinsic::aarch64_sve_sel:
2935 return instCombineSVESel(IC, II);
2936 case Intrinsic::aarch64_sve_srshl:
2937 return instCombineSVESrshl(IC, II);
2938 case Intrinsic::aarch64_sve_dupq_lane:
2939 return instCombineSVEDupqLane(IC, II);
2940 case Intrinsic::aarch64_sve_insr:
2941 return instCombineSVEInsr(IC, II);
2942 case Intrinsic::aarch64_sve_whilelo:
2943 return instCombineWhilelo(IC, II);
2944 case Intrinsic::aarch64_sve_ptrue:
2945 return instCombinePTrue(IC, II);
2946 case Intrinsic::aarch64_sve_uxtb:
2947 return instCombineSVEUxt(IC, II, 8);
2948 case Intrinsic::aarch64_sve_uxth:
2949 return instCombineSVEUxt(IC, II, 16);
2950 case Intrinsic::aarch64_sve_uxtw:
2951 return instCombineSVEUxt(IC, II, 32);
2952 case Intrinsic::aarch64_sme_in_streaming_mode:
2953 return instCombineInStreamingMode(IC, II);
2954 }
2955
2956 return std::nullopt;
2957}
2958
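// For the NEON narrowing intrinsics handled below, the demanded lanes of the
// result map one-to-one onto the demanded lanes of the vector operand.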
2960 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2961 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2962 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2963 SimplifyAndSetOp) const {
2964 switch (II.getIntrinsicID()) {
2965 default:
2966 break;
2967 case Intrinsic::aarch64_neon_fcvtxn:
2968 case Intrinsic::aarch64_neon_rshrn:
2969 case Intrinsic::aarch64_neon_sqrshrn:
2970 case Intrinsic::aarch64_neon_sqrshrun:
2971 case Intrinsic::aarch64_neon_sqshrn:
2972 case Intrinsic::aarch64_neon_sqshrun:
2973 case Intrinsic::aarch64_neon_sqxtn:
2974 case Intrinsic::aarch64_neon_sqxtun:
2975 case Intrinsic::aarch64_neon_uqrshrn:
2976 case Intrinsic::aarch64_neon_uqshrn:
2977 case Intrinsic::aarch64_neon_uqxtn:
2978 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2979 break;
2980 }
2981
2982 return std::nullopt;
2983}
2984
2986 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2988}
2989
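// Report the register width for each vector register kind: 64-bit scalar
// registers, fixed-length NEON/SVE vectors, and scalable SVE vectors.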
2992 switch (K) {
2994 return TypeSize::getFixed(64);
2996 if (ST->useSVEForFixedLengthVectors() &&
2997 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2998 return TypeSize::getFixed(
2999 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3000 else if (ST->isNeonAvailable())
3001 return TypeSize::getFixed(128);
3002 else
3003 return TypeSize::getFixed(0);
3005 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3007 return TypeSize::getScalable(128);
3008 else
3009 return TypeSize::getScalable(0);
3010 }
3011 llvm_unreachable("Unsupported register kind");
3012}
3013
3014bool AArch64TTIImpl::isSingleExtWideningInstruction(
3015 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3016 Type *SrcOverrideTy) const {
3017  // A helper that builds a vector type using the scalar type of ArgTy and the
3018  // element count of the destination type DstTy.
3019 auto toVectorTy = [&](Type *ArgTy) {
3020 return VectorType::get(ArgTy->getScalarType(),
3021 cast<VectorType>(DstTy)->getElementCount());
3022 };
3023
3024 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3025 // i32, i64]. SVE doesn't generally have the same set of instructions to
3026 // perform an extend with the add/sub/mul. There are SMULLB style
3027 // instructions, but they operate on top/bottom, requiring some sort of lane
3028 // interleaving to be used with zext/sext.
3029 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3030 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3031 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3032 return false;
3033
3034 Type *SrcTy = SrcOverrideTy;
3035 switch (Opcode) {
3036 case Instruction::Add: // UADDW(2), SADDW(2).
3037 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3038 // The second operand needs to be an extend
3039 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3040 if (!SrcTy)
3041 SrcTy =
3042 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3043 break;
3044 }
3045
3046 if (Opcode == Instruction::Sub)
3047 return false;
3048
3049    // UADDW(2), SADDW(2) can be commuted.
3050 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3051 if (!SrcTy)
3052 SrcTy =
3053 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3054 break;
3055 }
3056 return false;
3057 }
3058 default:
3059 return false;
3060 }
3061
3062 // Legalize the destination type and ensure it can be used in a widening
3063 // operation.
3064 auto DstTyL = getTypeLegalizationCost(DstTy);
3065 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3066 return false;
3067
3068 // Legalize the source type and ensure it can be used in a widening
3069 // operation.
3070 assert(SrcTy && "Expected some SrcTy");
3071 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3072 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3073 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3074 return false;
3075
3076 // Get the total number of vector elements in the legalized types.
3077 InstructionCost NumDstEls =
3078 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3079 InstructionCost NumSrcEls =
3080 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3081
3082 // Return true if the legalized types have the same number of vector elements
3083 // and the destination element type size is twice that of the source type.
3084 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3085}
3086
3087Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3089 Type *SrcOverrideTy) const {
3090 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3091 Opcode != Instruction::Mul)
3092 return nullptr;
3093
3094 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3095 // i32, i64]. SVE doesn't generally have the same set of instructions to
3096 // perform an extend with the add/sub/mul. There are SMULLB style
3097 // instructions, but they operate on top/bottom, requiring some sort of lane
3098 // interleaving to be used with zext/sext.
3099 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3100 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3101 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3102 return nullptr;
3103
3104 auto getScalarSizeWithOverride = [&](const Value *V) {
3105 if (SrcOverrideTy)
3106 return SrcOverrideTy->getScalarSizeInBits();
3107 return cast<Instruction>(V)
3108 ->getOperand(0)
3109 ->getType()
3110 ->getScalarSizeInBits();
3111 };
3112
3113 unsigned MaxEltSize = 0;
3114 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3115 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3116 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3117 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3118 MaxEltSize = std::max(EltSize0, EltSize1);
3119 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3120 isa<SExtInst, ZExtInst>(Args[1])) {
3121 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3122 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3123 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3124 // enough.
3125 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3126 return nullptr;
3127 MaxEltSize = DstEltSize / 2;
3128 } else if (Opcode == Instruction::Mul &&
3129 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3130 // If one of the operands is a Zext and the other has enough zero bits
3131 // to be treated as unsigned, we can still generate a umull, meaning the
3132 // zext is free.
3133 KnownBits Known =
3134 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3135 if (Args[0]->getType()->getScalarSizeInBits() -
3136 Known.Zero.countLeadingOnes() >
3137 DstTy->getScalarSizeInBits() / 2)
3138 return nullptr;
3139
3140 MaxEltSize =
3141 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3142 } else
3143 return nullptr;
3144
3145 if (MaxEltSize * 2 > DstEltSize)
3146 return nullptr;
3147
3148 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3149 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3150 return nullptr;
3151 return ExtTy;
3152}
3153
3154// s/urhadd instructions implement the following pattern, making the
3155// extends free:
3156// %x = add ((zext i8 -> i16), 1)
3157// %y = (zext i8 -> i16)
3158// trunc i16 (lshr (add %x, %y), 1) -> i8
3159//
3161 Type *Src) const {
3162 // The source should be a legal vector type.
3163 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3164 (Src->isScalableTy() && !ST->hasSVE2()))
3165 return false;
3166
3167 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3168 return false;
3169
3170  // Look for the add/lshr/trunc chain before trying to match the full pattern.
3171 const Instruction *Add = ExtUser;
3172 auto *AddUser =
3173 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3174 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3175 Add = AddUser;
3176
3177 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3178 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3179 return false;
3180
3181 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3182 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3183 Src->getScalarSizeInBits() !=
3184 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3185 return false;
3186
3187 // Try to match the whole pattern. Ext could be either the first or second
3188 // m_ZExtOrSExt matched.
3189 Instruction *Ex1, *Ex2;
3190 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3191 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3192 return false;
3193
3194 // Ensure both extends are of the same type
3195 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3196 Ex1->getOpcode() == Ex2->getOpcode())
3197 return true;
3198
3199 return false;
3200}
3201
3202InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3203                                                 Type *Src,
3204                                                 TTI::CastContextHint CCH,
3205                                                 TTI::TargetCostKind CostKind,
3206                                                 const Instruction *I) const {
3207 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3208 assert(ISD && "Invalid opcode");
3209 // If the cast is observable, and it is used by a widening instruction (e.g.,
3210 // uaddl, saddw, etc.), it may be free.
3211 if (I && I->hasOneUser()) {
3212 auto *SingleUser = cast<Instruction>(*I->user_begin());
3213 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3214 if (Type *ExtTy = isBinExtWideningInstruction(
3215 SingleUser->getOpcode(), Dst, Operands,
3216 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3217      // The cost from Src->Src*2 needs to be added if required; the cost from
3218      // Src*2->ExtTy is free.
3219 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3220 Type *DoubleSrcTy =
3221 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3222        return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3223                                TTI::CastContextHint::None, CostKind);
3224      }
3225
3226 return 0;
3227 }
3228
3229 if (isSingleExtWideningInstruction(
3230 SingleUser->getOpcode(), Dst, Operands,
3231 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3232        // For adds, only count the second operand as free if both operands are
3233        // extends but not the same operation (i.e. both operands are not free in
3234        // add(sext, zext)).
3235 if (SingleUser->getOpcode() == Instruction::Add) {
3236 if (I == SingleUser->getOperand(1) ||
3237 (isa<CastInst>(SingleUser->getOperand(1)) &&
3238 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3239 return 0;
3240 } else {
3241 // Others are free so long as isSingleExtWideningInstruction
3242 // returned true.
3243 return 0;
3244 }
3245 }
3246
3247 // The cast will be free for the s/urhadd instructions
3248 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3249 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3250 return 0;
3251 }
3252
3253 // TODO: Allow non-throughput costs that aren't binary.
3254  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3255    if (CostKind != TTI::TCK_RecipThroughput)
3256      return Cost == 0 ? 0 : 1;
3257 return Cost;
3258 };
3259
3260 EVT SrcTy = TLI->getValueType(DL, Src);
3261 EVT DstTy = TLI->getValueType(DL, Dst);
3262
3263 if (!SrcTy.isSimple() || !DstTy.isSimple())
3264 return AdjustCost(
3265 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3266
3267 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3268 // we use fcvtx under SVE2. Give them invalid costs.
3269 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3270 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3271      DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3272    return InstructionCost::getInvalid();
3273
3274 static const TypeConversionCostTblEntry BF16Tbl[] = {
3275 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3276 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3277 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3278 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3279 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3280 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3281 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3282 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3283 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3284 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3285 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3286 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3287 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3288 };
3289
3290 if (ST->hasBF16())
3291 if (const auto *Entry = ConvertCostTableLookup(
3292 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3293 return AdjustCost(Entry->Cost);
3294
3295 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3296 // The cost of unpacking twice is artificially increased for now in order
3297 // to avoid regressions against NEON, which will use tbl instructions directly
3298 // instead of multiple layers of [s|u]unpk[lo|hi].
3299 // We use the unpacks in cases where the destination type is illegal and
3300 // requires splitting of the input, even if the input type itself is legal.
3301 const unsigned int SVE_EXT_COST = 1;
3302 const unsigned int SVE_FCVT_COST = 1;
3303 const unsigned int SVE_UNPACK_ONCE = 4;
3304 const unsigned int SVE_UNPACK_TWICE = 16;
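  // For example, the nxv8f32 <- nxv8i16 entries below therefore cost
  // SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 6, even though the expansion is
  // roughly [s|u]unpklo + [s|u]unpkhi plus two converts.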
3305
3306 static const TypeConversionCostTblEntry ConversionTbl[] = {
3307 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3308 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3309 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3310 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3311 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3312 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3313 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3314 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3315 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3316 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3317 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3318 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3319 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3320 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3321 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3322 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3323 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3324 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3325 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3326 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3327
3328 // Truncations on nxvmiN
3329 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3330 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3331 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3332 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3333 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3334 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3335 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3336 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3337 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3338 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3339 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3340 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3341 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3342 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3343 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3344 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3345 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3346 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3347 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3348 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3349 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3350 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3351 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3352 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3353 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3354 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3355 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3356 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3357 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3358 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3359 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3360 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3361 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3362
3363 // The number of shll instructions for the extension.
3364 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3365 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3366 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3367 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3368 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3369 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3370 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3371 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3372 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3373 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3374 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3375 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3376 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3377 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3378 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3379 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3380
3381 // FP Ext and trunc
3382 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3383 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3384 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3385 // FP16
3386 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3387 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3388 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3389 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3390 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3391 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3392 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3393 // BF16 (uses shift)
3394 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3395 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3396 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3397 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3398 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3399 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3400 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3401 // FP Ext and trunc
3402 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3403 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3404 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3405 // FP16
3406 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3407 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3408 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3409 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3410 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3411 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3412 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3413      // BF16 (more complex; the +bf16 case is handled above)
3414 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3415 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3416 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3417 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3418 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3419 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3420 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3421 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3422
3423 // LowerVectorINT_TO_FP:
3424 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3425 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3426 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3427 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3428 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3429 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3430
3431 // SVE: to nxv2f16
3432 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3433 SVE_EXT_COST + SVE_FCVT_COST},
3434 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3435 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3436 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3437 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3438 SVE_EXT_COST + SVE_FCVT_COST},
3439 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3440 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3441 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3442
3443 // SVE: to nxv4f16
3444 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3445 SVE_EXT_COST + SVE_FCVT_COST},
3446 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3447 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3448 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3449 SVE_EXT_COST + SVE_FCVT_COST},
3450 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3451 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3452
3453 // SVE: to nxv8f16
3454 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3455 SVE_EXT_COST + SVE_FCVT_COST},
3456 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3457 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3458 SVE_EXT_COST + SVE_FCVT_COST},
3459 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3460
3461 // SVE: to nxv16f16
3462 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3463 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3464 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3465 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3466
3467 // Complex: to v2f32
3468 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3469 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3470 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3471 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3472
3473 // SVE: to nxv2f32
3474 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3475 SVE_EXT_COST + SVE_FCVT_COST},
3476 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3477 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3478 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3479 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3480 SVE_EXT_COST + SVE_FCVT_COST},
3481 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3482 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3483 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3484
3485 // Complex: to v4f32
3486 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3487 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3488 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3489 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3490
3491 // SVE: to nxv4f32
3492 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3493 SVE_EXT_COST + SVE_FCVT_COST},
3494 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3495 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3496 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3497 SVE_EXT_COST + SVE_FCVT_COST},
3498 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3499      {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3500
3501 // Complex: to v8f32
3502 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3503 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3504 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3505 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3506
3507 // SVE: to nxv8f32
3508 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3509 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3510 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3511 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3512 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3513 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3514 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3515 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3516
3517 // SVE: to nxv16f32
3518 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3519 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3520 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3521 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3522
3523 // Complex: to v16f32
3524 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3525 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3526
3527 // Complex: to v2f64
3528 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3529 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3530 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3531 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3532 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3533 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3534
3535 // SVE: to nxv2f64
3536 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3537 SVE_EXT_COST + SVE_FCVT_COST},
3538 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3539 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3540 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3541 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3542 SVE_EXT_COST + SVE_FCVT_COST},
3543 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3544 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3545 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3546
3547 // Complex: to v4f64
3548 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3549 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3550
3551 // SVE: to nxv4f64
3552 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3553 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3554 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3555 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3556 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3557 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3558 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3559 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3560 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3561 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3562 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3563 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3564
3565 // SVE: to nxv8f64
3566 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3567 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3568 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3569 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3570 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3571 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3572 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3573 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3574
3575 // LowerVectorFP_TO_INT
3576 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3577 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3578 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3579 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3580 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3581 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3582
3583 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3584 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3585 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3586 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3587 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3588 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3589 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3590
3591 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3592 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3593 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3594 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3595 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3596
3597 // Complex, from nxv2f32.
3598 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3599 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3600 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3601 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3602 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3603 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3604 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3605 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3606
3607 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3608 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3609 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3610 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3611 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3612 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3613 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3614
3615 // Complex, from nxv2f64.
3616 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3617 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3618 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3619 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3620 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3621 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3622 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3623 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3624 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3625 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3626
3627 // Complex, from nxv4f32.
3628 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3629 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3630 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3631 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3632 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3633 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3634 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3635 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3636 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3637 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3638
3639 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3640 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3641 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3642 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3643 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3644
3645 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3646 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3647 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3648 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3649 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3650 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3651 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3652
3653 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3654 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3655 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3656 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3657 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3658
3659 // Complex, from nxv8f16.
3660 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3661 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3662 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3663 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3664 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3665 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3666 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3667 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3668 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3669 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3670
3671 // Complex, from nxv4f16.
3672 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3673 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3674 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3675 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3676 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3677 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3678 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3679 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3680
3681 // Complex, from nxv2f16.
3682 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3683 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3684 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3685 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3686 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3687 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3688 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3689 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3690
3691 // Truncate from nxvmf32 to nxvmf16.
3692 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3693 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3694 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3695
3696 // Truncate from nxvmf32 to nxvmbf16.
3697 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3698 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3699 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3700
3701 // Truncate from nxvmf64 to nxvmf16.
3702 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3703 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3704 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3705
3706 // Truncate from nxvmf64 to nxvmbf16.
3707 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3708 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3709 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3710
3711 // Truncate from nxvmf64 to nxvmf32.
3712 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3713 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3714 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3715
3716 // Extend from nxvmf16 to nxvmf32.
3717 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3718 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3719 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3720
3721 // Extend from nxvmbf16 to nxvmf32.
3722 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3723 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3724 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3725
3726 // Extend from nxvmf16 to nxvmf64.
3727 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3728 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3729 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3730
3731 // Extend from nxvmbf16 to nxvmf64.
3732 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3733 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3734 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3735
3736 // Extend from nxvmf32 to nxvmf64.
3737 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3738 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3739 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3740
3741 // Bitcasts from float to integer
3742 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3743 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3744 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3745
3746 // Bitcasts from integer to float
3747 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3748 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3749 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3750
3751 // Add cost for extending to illegal -too wide- scalable vectors.
3752 // zero/sign extend are implemented by multiple unpack operations,
3753 // where each operation has a cost of 1.
3754 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3755 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3756 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3757 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3758 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3759 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3760
3761 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3762 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3763 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3764 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3765 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3766 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3767 };
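  // For example, a trunc from <4 x i64> to <4 x i16> matches the
  // {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2} entry above and is therefore
  // costed as 2 (uzp1 + xtn).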
3768
3769  // We have to estimate the cost of a fixed-length operation performed on
3770  // SVE registers by scaling with the number of registers required to
3771  // represent the fixed-length type on SVE registers.
3772 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3773 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3774 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3775 ST->useSVEForFixedLengthVectors(WiderTy)) {
3776 std::pair<InstructionCost, MVT> LT =
3777 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3778 unsigned NumElements =
3779 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3780 return AdjustCost(
3781 LT.first *
3782            getCastInstrCost(
3783                Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3784 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3785 CostKind, I));
3786 }
3787
3788 if (const auto *Entry = ConvertCostTableLookup(
3789 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3790 return AdjustCost(Entry->Cost);
3791
3792 static const TypeConversionCostTblEntry FP16Tbl[] = {
3793 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3794 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3795 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3796 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3797 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3798 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3799 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3800 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3801 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3802 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3803 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3804 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3805 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3806 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3807 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3808 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3809 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3810 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3811 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3812 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3813 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3814 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3815 };
3816
3817 if (ST->hasFullFP16())
3818 if (const auto *Entry = ConvertCostTableLookup(
3819 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3820 return AdjustCost(Entry->Cost);
3821
3822 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3823 // double-rounding issues.
3824 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3825 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3827 return AdjustCost(
3829 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3830 CCH, CostKind) +
3832 CostKind) +
3834 CostKind));
3835
3836  if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3837      CCH == TTI::CastContextHint::Masked &&
3838      ST->isSVEorStreamingSVEAvailable() &&
3839      TLI->getTypeAction(Src->getContext(), SrcTy) ==
3840          TargetLowering::TypePromoteInteger &&
3841      TLI->getTypeAction(Dst->getContext(), DstTy) ==
3842          TargetLowering::TypeSplitVector) {
3843    // The standard behaviour in the backend for these cases is to split the
3844 // extend up into two parts:
3845 // 1. Perform an extending load or masked load up to the legal type.
3846 // 2. Extend the loaded data to the final type.
3847 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3848 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3849    InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3850        Opcode, LegalTy, Src, CCH, CostKind, I);
3851    InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3852        Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3853 return Part1 + Part2;
3854 }
3855
3856 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3857 // but we also want to include the TTI::CastContextHint::Masked case too.
3858 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3859      CCH == TTI::CastContextHint::Masked &&
3860      ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3861    return 0;
3862
3863 return AdjustCost(
3864 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3865}
3866
3867InstructionCost
3868AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3869                                         VectorType *VecTy, unsigned Index,
3870                                         TTI::TargetCostKind CostKind) const {
3871
3872 // Make sure we were given a valid extend opcode.
3873 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3874 "Invalid opcode");
3875
3876 // We are extending an element we extract from a vector, so the source type
3877 // of the extend is the element type of the vector.
3878 auto *Src = VecTy->getElementType();
3879
3880 // Sign- and zero-extends are for integer types only.
3881 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3882
3883 // Get the cost for the extract. We compute the cost (if any) for the extend
3884 // below.
3885 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3886 CostKind, Index, nullptr, nullptr);
3887
3888 // Legalize the types.
3889 auto VecLT = getTypeLegalizationCost(VecTy);
3890 auto DstVT = TLI->getValueType(DL, Dst);
3891 auto SrcVT = TLI->getValueType(DL, Src);
3892
3893 // If the resulting type is still a vector and the destination type is legal,
3894 // we may get the extension for free. If not, get the default cost for the
3895 // extend.
3896 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3897 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3898 CostKind);
3899
3900 // The destination type should be larger than the element type. If not, get
3901 // the default cost for the extend.
3902 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3903 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3904 CostKind);
3905
3906 switch (Opcode) {
3907 default:
3908 llvm_unreachable("Opcode should be either SExt or ZExt");
3909
3910 // For sign-extends, we only need a smov, which performs the extension
3911 // automatically.
3912 case Instruction::SExt:
3913 return Cost;
3914
3915 // For zero-extends, the extend is performed automatically by a umov unless
3916 // the destination type is i64 and the element type is i8 or i16.
3917 case Instruction::ZExt:
3918 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3919 return Cost;
3920 }
3921
3922 // If we are unable to perform the extend for free, get the default cost.
3923 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3924 CostKind);
3925}
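// For example, sign-extending the (illustrative)
//   extractelement <4 x i32> %v, i64 1
// to i64 is costed above as just the extract, because smov performs the sign
// extension as part of the lane move.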
3926
3927InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3928                                               TTI::TargetCostKind CostKind,
3929                                               const Instruction *I) const {
3930  if (CostKind != TTI::TCK_RecipThroughput)
3931    return Opcode == Instruction::PHI ? 0 : 1;
3932 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3933 // Branches are assumed to be predicted.
3934 return 0;
3935}
3936
3937InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3938 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3939 const Instruction *I, Value *Scalar,
3940 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3941 assert(Val->isVectorTy() && "This must be a vector type");
3942
3943 if (Index != -1U) {
3944 // Legalize the type.
3945 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3946
3947 // This type is legalized to a scalar type.
3948 if (!LT.second.isVector())
3949 return 0;
3950
3951 // The type may be split. For fixed-width vectors we can normalize the
3952 // index to the new type.
3953 if (LT.second.isFixedLengthVector()) {
3954 unsigned Width = LT.second.getVectorNumElements();
3955 Index = Index % Width;
3956 }
3957
3958    // The element at index zero is already inside the vector.
3959    // - For an insert-element or extract-element
3960    //   instruction that operates on integers, an explicit FPR -> GPR move is
3961    //   needed, so it has a non-zero cost.
3962 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3963 return 0;
3964
3965    // This is recognising an LD1 "single element structure to one lane of one
3966    // register" instruction. I.e., if this is an `insertelement` instruction
3967    // and its second operand is a load, then we will generate an LD1, which
3968    // is an expensive instruction.
3969 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3970 return CostKind == TTI::TCK_CodeSize
3971                 ? 0
3972                 : ST->getVectorInsertExtractBaseCost();
3973
3974 // i1 inserts and extract will include an extra cset or cmp of the vector
3975 // value. Increase the cost by 1 to account.
3976 if (Val->getScalarSizeInBits() == 1)
3977 return CostKind == TTI::TCK_CodeSize
3978                 ? 2
3979                 : ST->getVectorInsertExtractBaseCost() + 1;
3980
3981 // FIXME:
3982 // If the extract-element and insert-element instructions could be
3983 // simplified away (e.g., could be combined into users by looking at use-def
3984 // context), they have no cost. This is not done in the first place for
3985 // compile-time considerations.
3986 }
3987
3988 // In case of Neon, if there exists extractelement from lane != 0 such that
3989 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3990 // 2. extractelement result feeds into fmul.
3991 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3992 // equivalent to 0.
3993 // then the extractelement can be merged with fmul in the backend and it
3994 // incurs no cost.
3995 // e.g.
3996 // define double @foo(<2 x double> %a) {
3997 // %1 = extractelement <2 x double> %a, i32 0
3998 // %2 = extractelement <2 x double> %a, i32 1
3999 // %res = fmul double %1, %2
4000 // ret double %res
4001 // }
4002 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4003 auto ExtractCanFuseWithFmul = [&]() {
4004 // We bail out if the extract is from lane 0.
4005 if (Index == 0)
4006 return false;
4007
4008 // Check if the scalar element type of the vector operand of ExtractElement
4009 // instruction is one of the allowed types.
4010 auto IsAllowedScalarTy = [&](const Type *T) {
4011 return T->isFloatTy() || T->isDoubleTy() ||
4012 (T->isHalfTy() && ST->hasFullFP16());
4013 };
4014
4015 // Check if the extractelement user is scalar fmul.
4016 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4017 // Check if the user is scalar fmul.
4018 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4019 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4020 !BO->getType()->isVectorTy();
4021 };
4022
4023 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4024 // certain scalar type and a certain vector register width.
4025 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4026 auto RegWidth =
4027          getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4028              .getFixedValue();
4029 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4030 };
4031
4032 // Check if the type constraints on input vector type and result scalar type
4033 // of extractelement instruction are satisfied.
4034 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4035 return false;
4036
4037 if (Scalar) {
4038 DenseMap<User *, unsigned> UserToExtractIdx;
4039 for (auto *U : Scalar->users()) {
4040 if (!IsUserFMulScalarTy(U))
4041 return false;
4042 // Recording entry for the user is important. Index value is not
4043 // important.
4044 UserToExtractIdx[U];
4045 }
4046 if (UserToExtractIdx.empty())
4047 return false;
4048 for (auto &[S, U, L] : ScalarUserAndIdx) {
4049 for (auto *U : S->users()) {
4050 if (UserToExtractIdx.contains(U)) {
4051 auto *FMul = cast<BinaryOperator>(U);
4052 auto *Op0 = FMul->getOperand(0);
4053 auto *Op1 = FMul->getOperand(1);
4054 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4055 UserToExtractIdx[U] = L;
4056 break;
4057 }
4058 }
4059 }
4060 }
4061 for (auto &[U, L] : UserToExtractIdx) {
4062 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4063 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4064 return false;
4065 }
4066 } else {
4067 const auto *EE = cast<ExtractElementInst>(I);
4068
4069 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4070 if (!IdxOp)
4071 return false;
4072
4073 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4074 if (!IsUserFMulScalarTy(U))
4075 return false;
4076
4077 // Check if the other operand of extractelement is also extractelement
4078 // from lane equivalent to 0.
4079 const auto *BO = cast<BinaryOperator>(U);
4080 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4081 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4082 if (OtherEE) {
4083 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4084 if (!IdxOp)
4085 return false;
4086 return IsExtractLaneEquivalentToZero(
4087 cast<ConstantInt>(OtherEE->getIndexOperand())
4088 ->getValue()
4089 .getZExtValue(),
4090 OtherEE->getType()->getScalarSizeInBits());
4091 }
4092 return true;
4093 });
4094 }
4095 return true;
4096 };
4097
4098 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4099 ExtractCanFuseWithFmul())
4100 return 0;
4101
4102 // All other insert/extracts cost this much.
4103 return CostKind == TTI::TCK_CodeSize ? 1
4104 : ST->getVectorInsertExtractBaseCost();
4105}
4106
4107InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4108                                                   TTI::TargetCostKind CostKind,
4109                                                   unsigned Index,
4110                                                   const Value *Op0,
4111                                                   const Value *Op1) const {
4112 // Treat insert at lane 0 into a poison vector as having zero cost. This
4113 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4114 // single dup) are treated as cheap.
4115 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4116 isa<PoisonValue>(Op0))
4117 return 0;
4118 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4119}
4120
4121InstructionCost AArch64TTIImpl::getVectorInstrCost(
4122    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4123 Value *Scalar,
4124 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4125 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4126 ScalarUserAndIdx);
4127}
4128
4129InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4130                                                    Type *Val,
4131                                                    TTI::TargetCostKind CostKind,
4132                                                    unsigned Index) const {
4133 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4134}
4135
4139 unsigned Index) const {
4140 if (isa<FixedVectorType>(Val))
4142 Index);
4143
4144 // This typically requires both while and lastb instructions in order
4145 // to extract the last element. If this is in a loop the while
4146 // instruction can at least be hoisted out, although it will consume a
4147 // predicate register. The cost should be more expensive than the base
4148 // extract cost, which is 2 for most CPUs.
4149 return CostKind == TTI::TCK_CodeSize
4150 ? 2
4151 : ST->getVectorInsertExtractBaseCost() + 1;
4152}
4153
4154InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4155    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4156    TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4157    ArrayRef<Value *> VL) const {
4158  if (isa<ScalableVectorType>(Ty))
4159    return InstructionCost::getInvalid();
4160 if (Ty->getElementType()->isFloatingPointTy())
4161 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4162 CostKind);
4163 unsigned VecInstCost =
4164 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4165 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4166}
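// For example, inserting all four elements of an (illustrative) <4 x i32> is
// costed above as 4 * getVectorInsertExtractBaseCost() (4 * 1 at
// TCK_CodeSize), while floating-point element types fall back to the base
// implementation.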
4167
4168std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4169    Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4170    TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4171 std::function<InstructionCost(Type *)> InstCost) const {
4172 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4173 return std::nullopt;
4174 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4175 return std::nullopt;
4176 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4177 ST->isNonStreamingSVEorSME2Available())
4178 return std::nullopt;
4179
4180 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4181 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4183 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4184 Cost *= 2;
4185 Cost += InstCost(PromotedTy);
4186 if (IncludeTrunc)
4187 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4189 return Cost;
4190}
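// For example, an fadd on <4 x half> without +fullfp16 is costed above as an
// fpext to <4 x float> (doubled when neither operand is constant), plus the
// promoted <4 x float> fadd, plus an fptrunc back to <4 x half> when
// IncludeTrunc is set.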
4191
4193 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4195 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4196
4197 // The code-generator is currently not able to handle scalable vectors
4198 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4199 // it. This change will be removed when code-generation for these types is
4200 // sufficiently reliable.
4201 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4202 if (VTy->getElementCount() == ElementCount::getScalable(1))
4204
4205 // TODO: Handle more cost kinds.
4207 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4208 Op2Info, Args, CxtI);
4209
4210 // Legalize the type.
4211 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4212 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4213
4214 // Increase the cost for half and bfloat types if not architecturally
4215 // supported.
4216 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4217 ISD == ISD::FDIV || ISD == ISD::FREM)
4218 if (auto PromotedCost = getFP16BF16PromoteCost(
4219 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4220 // There is not native support for fdiv/frem even with +sve-b16b16.
4221 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4222 [&](Type *PromotedTy) {
4223 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4224 Op1Info, Op2Info);
4225 }))
4226 return *PromotedCost;
4227
4228 // If the operation is a widening instruction (smull or umull) and both
4229 // operands are extends the cost can be cheaper by considering that the
4230 // operation will operate on the narrowest type size possible (double the
4231 // largest input size) and a further extend.
4232 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4233 if (ExtTy != Ty)
4234 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4235 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4237 return LT.first;
4238 }
4239
4240 switch (ISD) {
4241 default:
4242 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4243 Op2Info);
4244 case ISD::SREM:
4245 case ISD::SDIV:
4246 /*
4247 Notes for sdiv/srem specific costs:
4248 1. This only considers the cases where the divisor is constant, uniform and
4249 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4250 result in some form of (ldr + adrp), corresponding to constant vectors, or
4251 scalarization of the division operation.
4252 2. Constant divisors, either negative in whole or partially, don't result in
4253 significantly different codegen as compared to positive constant divisors.
4254 So, we don't consider negative divisors separately.
4255 3. If the codegen is significantly different with SVE, it has been indicated
4256 using comments at appropriate places.
4257
4258 sdiv specific cases:
4259 -----------------------------------------------------------------------
4260 codegen | pow-of-2 | Type
4261 -----------------------------------------------------------------------
4262 add + cmp + csel + asr | Y | i64
4263 add + cmp + csel + asr | Y | i32
4264 -----------------------------------------------------------------------
4265
4266 srem specific cases:
4267 -----------------------------------------------------------------------
4268 codegen | pow-of-2 | Type
4269 -----------------------------------------------------------------------
4270 negs + and + and + csneg | Y | i64
4271 negs + and + and + csneg | Y | i32
4272 -----------------------------------------------------------------------
4273
4274 other sdiv/srem cases:
4275 -------------------------------------------------------------------------
4276 common codegen | + srem | + sdiv | pow-of-2 | Type
4277 -------------------------------------------------------------------------
4278 smulh + asr + add + add | - | - | N | i64
4279 smull + lsr + add + add | - | - | N | i32
4280 usra | and + sub | sshr | Y | <2 x i64>
4281 2 * (scalar code) | - | - | N | <2 x i64>
4282 usra | bic + sub | sshr + neg | Y | <4 x i32>
4283 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4284 + sshr + usra | | | |
4285 -------------------------------------------------------------------------
4286 */
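    // Worked example: for an (illustrative) scalar 'sdiv i32 %a, 8' the
    // divisor is a uniform power-of-2 constant, so per the table above the
    // lowering is add + cmp + csel + asr and the code below returns
    // 3 * AddCost + AsrCost.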
4287 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4288 InstructionCost AddCost =
4289 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4290 Op1Info.getNoProps(), Op2Info.getNoProps());
4291 InstructionCost AsrCost =
4292 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4293 Op1Info.getNoProps(), Op2Info.getNoProps());
4294 InstructionCost MulCost =
4295 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4296 Op1Info.getNoProps(), Op2Info.getNoProps());
4297 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4298 // have similar cost.
4299 auto VT = TLI->getValueType(DL, Ty);
4300 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4301 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4302 // Neg can be folded into the asr instruction.
4303 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4304 : (3 * AsrCost + AddCost);
4305 } else {
4306 return MulCost + AsrCost + 2 * AddCost;
4307 }
4308 } else if (VT.isVector()) {
4309 InstructionCost UsraCost = 2 * AsrCost;
4310 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4311 // Division with scalable types corresponds to native 'asrd'
4312 // instruction when SVE is available.
4313 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4314
4315          // One more for the negation in SDIV
4316          InstructionCost Cost =
4317              (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4318 if (Ty->isScalableTy() && ST->hasSVE())
4319 Cost += 2 * AsrCost;
4320 else {
4321 Cost +=
4322 UsraCost +
4323 (ISD == ISD::SDIV
4324 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4325 : 2 * AddCost);
4326 }
4327 return Cost;
4328 } else if (LT.second == MVT::v2i64) {
4329 return VT.getVectorNumElements() *
4330 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4331 Op1Info.getNoProps(),
4332 Op2Info.getNoProps());
4333 } else {
4334 // When SVE is available, we get:
4335 // smulh + lsr + add/sub + asr + add/sub.
4336 if (Ty->isScalableTy() && ST->hasSVE())
4337 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4338 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4339 }
4340 }
4341 }
4342 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4343 LT.second.isFixedLengthVector()) {
4344 // FIXME: When the constant vector is non-uniform, this may result in
4345 // loading the vector from constant pool or in some cases, may also result
4346 // in scalarization. For now, we are approximating this with the
4347 // scalarization cost.
4348 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4349 CostKind, -1, nullptr, nullptr);
4350 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4351 CostKind, -1, nullptr, nullptr);
4352 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4353 return ExtractCost + InsertCost +
4354 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4355 CostKind, Op1Info.getNoProps(),
4356 Op2Info.getNoProps());
4357 }
4358 [[fallthrough]];
4359 case ISD::UDIV:
4360 case ISD::UREM: {
4361 auto VT = TLI->getValueType(DL, Ty);
4362 if (Op2Info.isConstant()) {
4363 // If the operand is a power of 2 we can use the shift or and cost.
4364 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4365 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4366 Op1Info.getNoProps(),
4367 Op2Info.getNoProps());
4368 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4369 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4370 Op1Info.getNoProps(),
4371 Op2Info.getNoProps());
4372
4373 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4374 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4375 // The MULHU will be expanded to UMULL for the types not listed below,
4376          // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
4377 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4378 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4379 LT.second == MVT::nxv16i8;
4380 bool Is128bit = LT.second.is128BitVector();
4381
4382 InstructionCost MulCost =
4383 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4384 Op1Info.getNoProps(), Op2Info.getNoProps());
4385 InstructionCost AddCost =
4386 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4387 Op1Info.getNoProps(), Op2Info.getNoProps());
4388 InstructionCost ShrCost =
4389 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4390 Op1Info.getNoProps(), Op2Info.getNoProps());
4391 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4392 (HasMULH ? 0 : ShrCost) + // UMULL shift
4393 AddCost * 2 + ShrCost;
4394 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4395 }
4396 }
4397
4398 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4399 // emitted by the backend even when those functions are not declared in the
4400 // module.
4401 if (!VT.isVector() && VT.getSizeInBits() > 64)
4402 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4403
4404    InstructionCost Cost = BaseT::getArithmeticInstrCost(
4405        Opcode, Ty, CostKind, Op1Info, Op2Info);
4406 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4407 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4408 // SDIV/UDIV operations are lowered using SVE, then we can have less
4409 // costs.
4410 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4411 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4412 static const CostTblEntry DivTbl[]{
4413 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4414 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4415 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4416 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4417 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4418 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4419
4420 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4421 if (nullptr != Entry)
4422 return Entry->Cost;
4423 }
4424 // For 8/16-bit elements, the cost is higher because the type
4425 // requires promotion and possibly splitting:
4426 if (LT.second.getScalarType() == MVT::i8)
4427 Cost *= 8;
4428 else if (LT.second.getScalarType() == MVT::i16)
4429 Cost *= 4;
4430 return Cost;
4431 } else {
4432 // If one of the operands is a uniform constant then the cost for each
4433 // element is Cost for insertion, extraction and division.
4434 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4435 // operation with scalar type
4436 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4437 (Op2Info.isConstant() && Op2Info.isUniform())) {
4438 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4439            InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4440                Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4441 return (4 + DivCost) * VTy->getNumElements();
4442 }
4443 }
4444 // On AArch64, without SVE, vector divisions are expanded
4445 // into scalar divisions of each pair of elements.
4446 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4447 -1, nullptr, nullptr);
4448 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4449 nullptr, nullptr);
4450 }
4451
4452 // TODO: if one of the arguments is scalar, then it's not necessary to
4453 // double the cost of handling the vector elements.
4454 Cost += Cost;
4455 }
4456 return Cost;
4457 }
4458 case ISD::MUL:
4459 // When SVE is available, then we can lower the v2i64 operation using
4460 // the SVE mul instruction, which has a lower cost.
4461 if (LT.second == MVT::v2i64 && ST->hasSVE())
4462 return LT.first;
4463
4464 // When SVE is not available, there is no MUL.2d instruction,
4465 // which means mul <2 x i64> is expensive as elements are extracted
4466 // from the vectors and the muls scalarized.
4467 // As getScalarizationOverhead is a bit too pessimistic, we
4468 // estimate the cost for a i64 vector directly here, which is:
4469 // - four 2-cost i64 extracts,
4470 // - two 2-cost i64 inserts, and
4471 // - two 1-cost muls.
4472 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4473 // LT.first = 2 the cost is 28.
4474 if (LT.second != MVT::v2i64)
4475 return LT.first;
4476 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4477 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4478 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4479 nullptr, nullptr) *
4480 2 +
4481 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4482 nullptr, nullptr));
4483 case ISD::ADD:
4484 case ISD::XOR:
4485 case ISD::OR:
4486 case ISD::AND:
4487 case ISD::SRL:
4488 case ISD::SRA:
4489 case ISD::SHL:
4490 // These nodes are marked as 'custom' for combining purposes only.
4491 // We know that they are legal. See LowerAdd in ISelLowering.
4492 return LT.first;
4493
4494 case ISD::FNEG:
4495 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4496 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4497 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4498 CxtI &&
4499 ((CxtI->hasOneUse() &&
4500 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4501 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4502 return 0;
4503 [[fallthrough]];
4504 case ISD::FADD:
4505 case ISD::FSUB:
4506 if (!Ty->getScalarType()->isFP128Ty())
4507 return LT.first;
4508 [[fallthrough]];
4509 case ISD::FMUL:
4510 case ISD::FDIV:
4511 // These nodes are marked as 'custom' just to lower them to SVE.
4512 // We know said lowering will incur no additional cost.
4513 if (!Ty->getScalarType()->isFP128Ty())
4514 return 2 * LT.first;
4515
4516 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4517 Op2Info);
4518 case ISD::FREM:
4519 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4520 // those functions are not declared in the module.
4521 if (!Ty->isVectorTy())
4522 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4523 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4524 Op2Info);
4525 }
4526}
4527
4528InstructionCost
4529AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4530                                          const SCEV *Ptr,
4531                                          TTI::TargetCostKind CostKind) const {
4532  // Address computations in vectorized code with non-consecutive addresses will
4533 // likely result in more instructions compared to scalar code where the
4534 // computation can more often be merged into the index mode. The resulting
4535 // extra micro-ops can significantly decrease throughput.
4536 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4537 int MaxMergeDistance = 64;
4538
4539 if (PtrTy->isVectorTy() && SE &&
4540 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4541 return NumVectorInstToHideOverhead;
4542
4543 // In many cases the address computation is not merged into the instruction
4544 // addressing mode.
4545 return 1;
4546}
4547
4548/// Check whether Opcode1 has less throughput according to the scheduling
4549/// model than Opcode2.
4550 bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4551 unsigned Opcode1, unsigned Opcode2) const {
4552 const MCSchedModel &Sched = ST->getSchedModel();
4553 const TargetInstrInfo *TII = ST->getInstrInfo();
4554 if (!Sched.hasInstrSchedModel())
4555 return false;
4556
4557 const MCSchedClassDesc *SCD1 =
4558 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4559 const MCSchedClassDesc *SCD2 =
4560 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4561 // We cannot handle variant scheduling classes without an MI. If we need to
4562 // support them for any of the instructions we query the information of we
4563 // might need to add a way to resolve them without a MI or not use the
4564 // scheduling info.
4565 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4566 "Cannot handle variant scheduling classes without an MI");
4567 if (!SCD1->isValid() || !SCD2->isValid())
4568 return false;
4569
4570 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4571 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4572}
4573
4574 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4575 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4576 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4577 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4578 // We don't lower some vector selects well that are wider than the register
4579 // width. TODO: Improve this with different cost kinds.
4580 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4581 // We would need this many instructions to hide the scalarization happening.
4582 const int AmortizationCost = 20;
4583
4584 // If VecPred is not set, check if we can get a predicate from the context
4585 // instruction, if its type matches the requested ValTy.
4586 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4587 CmpPredicate CurrentPred;
4588 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4589 m_Value())))
4590 VecPred = CurrentPred;
4591 }
4592 // Check if we have a compare/select chain that can be lowered using
4593 // a (F)CMxx & BFI pair.
4594 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4595 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4596 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4597 VecPred == CmpInst::FCMP_UNE) {
4598 static const auto ValidMinMaxTys = {
4599 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4600 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4601 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4602
4603 auto LT = getTypeLegalizationCost(ValTy);
4604 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4605 (ST->hasFullFP16() &&
4606 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4607 return LT.first;
4608 }
4609
4610 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4611 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4612 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4613 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4614 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4615 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4616 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4617 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4618 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4619 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4620 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4621 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4622
4623 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4624 EVT SelValTy = TLI->getValueType(DL, ValTy);
4625 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4626 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4627 SelCondTy.getSimpleVT(),
4628 SelValTy.getSimpleVT()))
4629 return Entry->Cost;
4630 }
4631 }
4632
4633 if (Opcode == Instruction::FCmp) {
4634 if (auto PromotedCost = getFP16BF16PromoteCost(
4635 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4636 // TODO: Consider costing SVE FCMPs.
4637 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4638 InstructionCost Cost =
4639 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4640 CostKind, Op1Info, Op2Info);
4641 if (isa<VectorType>(PromotedTy))
4642 Cost += getCastInstrCost(
4643 Instruction::Trunc,
4644 VectorType::getInteger(cast<VectorType>(ValTy)),
4645 VectorType::getInteger(cast<VectorType>(PromotedTy)),
4646 TTI::CastContextHint::None, CostKind);
4647 return Cost;
4648 }))
4649 return *PromotedCost;
4650
4651 auto LT = getTypeLegalizationCost(ValTy);
4652 // Model unknown fp compares as a libcall.
4653 if (LT.second.getScalarType() != MVT::f64 &&
4654 LT.second.getScalarType() != MVT::f32 &&
4655 LT.second.getScalarType() != MVT::f16)
4656 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4657 {ValTy, ValTy}, CostKind);
4658
4659 // Some comparison operators require expanding to multiple compares + or.
4660 unsigned Factor = 1;
4661 if (!CondTy->isVectorTy() &&
4662 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4663 Factor = 2; // fcmp with 2 selects
4664 else if (isa<FixedVectorType>(ValTy) &&
4665 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4666 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4667 Factor = 3; // fcmxx+fcmyy+or
4668 else if (isa<ScalableVectorType>(ValTy) &&
4669 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4670 Factor = 3; // fcmxx+fcmyy+or
4671
4672 if (isa<ScalableVectorType>(ValTy) &&
4674 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4675 AArch64::FCMEQv4f32))
4676 Factor *= 2;
4677
4678 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4679 }
4680
4681 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4682 // icmp(and, 0) as free, as we can make use of ands, but only if the
4683 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4684 // providing it will not cause performance regressions.
4685 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4686 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4687 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4688 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4689 if (match(I->getOperand(1), m_Zero()))
4690 return 0;
4691
4692 // x >= 1 / x < 1 -> x > 0 / x <= 0
4693 if (match(I->getOperand(1), m_One()) &&
4694 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4695 return 0;
4696
4697 // x <= -1 / x > -1 -> x > 0 / x <= 0
4698 if (match(I->getOperand(1), m_AllOnes()) &&
4699 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4700 return 0;
4701 }
4702
4703 // The base case handles scalable vectors fine for now, since it treats the
4704 // cost as 1 * legalization cost.
4705 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4706 Op1Info, Op2Info, I);
4707}
4708
4709 TTI::MemCmpExpansionOptions
4710 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4711 TTI::MemCmpExpansionOptions Options;
4712 if (ST->requiresStrictAlign()) {
4713 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4714 // a bunch of instructions when strict align is enabled.
4715 return Options;
4716 }
4717 Options.AllowOverlappingLoads = true;
4718 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4719 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4720 // TODO: Though vector loads usually perform well on AArch64, on some targets
4721 // they may wake up the FP unit, which raises the power consumption. Perhaps
4722 // they could be used with no holds barred (-O3).
4723 Options.LoadSizes = {8, 4, 2, 1};
4724 Options.AllowedTailExpansions = {3, 5, 6};
4725 return Options;
4726}
4727
4728 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4729 return ST->hasSVE();
4730}
4731
4735 Type *Src = MICA.getDataType();
4736
4737 if (useNeonVector(Src))
4738 return BaseT::getMaskedMemoryOpCost(MICA, CostKind);
4739 auto LT = getTypeLegalizationCost(Src);
4740 if (!LT.first.isValid())
4741 return InstructionCost::getInvalid();
4742
4743 // Return an invalid cost for element types that we are unable to lower.
4744 auto *VT = cast<VectorType>(Src);
4745 if (VT->getElementType()->isIntegerTy(1))
4746 return InstructionCost::getInvalid();
4747
4748 // The code-generator is currently not able to handle scalable vectors
4749 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4750 // it. This change will be removed when code-generation for these types is
4751 // sufficiently reliable.
4752 if (VT->getElementCount() == ElementCount::getScalable(1))
4753 return InstructionCost::getInvalid();
4754
4755 return LT.first;
4756}
4757
4758// This function returns gather/scatter overhead either from
4759// user-provided value or specialized values per-target from \p ST.
4760static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4761 const AArch64Subtarget *ST) {
4762 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4763 "Should be called on only load or stores.");
4764 switch (Opcode) {
4765 case Instruction::Load:
4766 if (SVEGatherOverhead.getNumOccurrences() > 0)
4767 return SVEGatherOverhead;
4768 return ST->getGatherOverhead();
4769 break;
4770 case Instruction::Store:
4771 if (SVEScatterOverhead.getNumOccurrences() > 0)
4772 return SVEScatterOverhead;
4773 return ST->getScatterOverhead();
4774 break;
4775 default:
4776 llvm_unreachable("Shouldn't have reached here");
4777 }
4778}
4779
4781 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4782 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4783 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4784 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4785 Alignment, CostKind, I);
4786 auto *VT = cast<VectorType>(DataTy);
4787 auto LT = getTypeLegalizationCost(DataTy);
4788 if (!LT.first.isValid())
4789 return InstructionCost::getInvalid();
4790
4791 // Return an invalid cost for element types that we are unable to lower.
4792 if (!LT.second.isVector() ||
4793 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4794 VT->getElementType()->isIntegerTy(1))
4795 return InstructionCost::getInvalid();
4796
4797 // The code-generator is currently not able to handle scalable vectors
4798 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4799 // it. This change will be removed when code-generation for these types is
4800 // sufficiently reliable.
4801 if (VT->getElementCount() == ElementCount::getScalable(1))
4802 return InstructionCost::getInvalid();
4803
4804 ElementCount LegalVF = LT.second.getVectorElementCount();
4805 InstructionCost MemOpCost =
4806 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4807 {TTI::OK_AnyValue, TTI::OP_None}, I);
4808 // Add on an overhead cost for using gathers/scatters.
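// Illustrative example (assuming a scalar load cost of 1, a gather overhead
// of 10 and a maximum vscale of 2): a gather of <vscale x 4 x i32> legalizes
// to nxv4i32 with LT.first == 1, MemOpCost becomes 1 * 10, and
// getMaxNumElements(4) == 8, so the returned cost is 1 * 10 * 8 = 80.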
4809 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4810 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4811}
4812
4814 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4815}
4816
4818 Align Alignment,
4819 unsigned AddressSpace,
4821 TTI::OperandValueInfo OpInfo,
4822 const Instruction *I) const {
4823 EVT VT = TLI->getValueType(DL, Ty, true);
4824 // Type legalization can't handle structs
4825 if (VT == MVT::Other)
4826 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4827 CostKind);
4828
4829 auto LT = getTypeLegalizationCost(Ty);
4830 if (!LT.first.isValid())
4831 return InstructionCost::getInvalid();
4832
4833 // The code-generator is currently not able to handle scalable vectors
4834 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4835 // it. This change will be removed when code-generation for these types is
4836 // sufficiently reliable.
4837 // We also only support full register predicate loads and stores.
4838 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4839 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4840 (VTy->getElementType()->isIntegerTy(1) &&
4841 !VTy->getElementCount().isKnownMultipleOf(
4842 ElementCount::getScalable(16))))
4843 return InstructionCost::getInvalid();
4844
4845 // TODO: consider latency as well for TCK_SizeAndLatency.
4846 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4847 return LT.first;
4848
4849 if (CostKind != TTI::TCK_RecipThroughput)
4850 return 1;
4851
4852 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4853 LT.second.is128BitVector() && Alignment < Align(16)) {
4854 // Unaligned stores are extremely inefficient. We don't split all
4855 // unaligned 128-bit stores because of the negative impact that splitting
4856 // has shown in practice on inlined block copy code.
4857 // We make such stores expensive so that we will only vectorize if there
4858 // are 6 other instructions getting vectorized.
4859 const int AmortizationCost = 6;
4860
4861 return LT.first * 2 * AmortizationCost;
4862 }
4863
4864 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4865 if (Ty->isPtrOrPtrVectorTy())
4866 return LT.first;
4867
4868 if (useNeonVector(Ty)) {
4869 // Check truncating stores and extending loads.
4870 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4871 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4872 if (VT == MVT::v4i8)
4873 return 2;
4874 // Otherwise we need to scalarize.
4875 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4876 }
4877 EVT EltVT = VT.getVectorElementType();
4878 unsigned EltSize = EltVT.getScalarSizeInBits();
4879 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4880 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4881 return LT.first;
4882 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4883 // widening to v4i8, which produces suboptimal results.
4884 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4885 return LT.first;
4886
4887 // Check non-power-of-2 loads/stores for legal vector element types with
4888 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4889 // operations on smaller power-of-2 ops, including ld1/st1.
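// Illustrative example: a v7i8 access with alignment 1 is split below into
// v4i8 + (v2i8 + v1i8), so the worklist loop accumulates a cost of 3.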
4890 LLVMContext &C = Ty->getContext();
4891 InstructionCost Cost = 0;
4892 SmallVector<EVT> TypeWorklist;
4893 TypeWorklist.push_back(VT);
4894 while (!TypeWorklist.empty()) {
4895 EVT CurrVT = TypeWorklist.pop_back_val();
4896 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4897 if (isPowerOf2_32(CurrNumElements)) {
4898 Cost += 1;
4899 continue;
4900 }
4901
4902 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4903 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4904 TypeWorklist.push_back(
4905 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4906 }
4907 return Cost;
4908 }
4909
4910 return LT.first;
4911}
4912
4914 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4915 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4916 bool UseMaskForCond, bool UseMaskForGaps) const {
4917 assert(Factor >= 2 && "Invalid interleave factor");
4918 auto *VecVTy = cast<VectorType>(VecTy);
4919
4920 if (VecTy->isScalableTy() && !ST->hasSVE())
4921 return InstructionCost::getInvalid();
4922
4923 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4924 // only have lowering for power-of-2 factors.
4925 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4926 // InterleavedAccessPass for ld3/st3
4927 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4928 return InstructionCost::getInvalid();
4929
4930 // Vectorization for masked interleaved accesses is only enabled for scalable
4931 // VF.
4932 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4933 return InstructionCost::getInvalid();
4934
4935 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4936 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4937 auto *SubVecTy =
4938 VectorType::get(VecVTy->getElementType(),
4939 VecVTy->getElementCount().divideCoefficientBy(Factor));
4940
4941 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4942 // Accesses having vector types that are a multiple of 128 bits can be
4943 // matched to more than one ldN/stN instruction.
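// Illustrative example: an ld3 of <12 x i32> uses <4 x i32> sub-vectors,
// which are legal 128-bit types, so the returned cost is Factor (3) times a
// single interleaved access, i.e. 3.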
4944 bool UseScalable;
4945 if (MinElts % Factor == 0 &&
4946 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4947 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4948 }
4949
4950 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4951 Alignment, AddressSpace, CostKind,
4952 UseMaskForCond, UseMaskForGaps);
4953}
4954
4955 InstructionCost
4956 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4957 InstructionCost Cost = 0;
4958 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4959 for (auto *I : Tys) {
4960 if (!I->isVectorTy())
4961 continue;
4962 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4963 128)
4964 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4965 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4966 }
4967 return Cost;
4968}
4969
4970 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
4971 return ST->getMaxInterleaveFactor();
4972}
4973
4974// For Falkor, we want to avoid having too many strided loads in a loop since
4975// that can exhaust the HW prefetcher resources. We adjust the unroller
4976// MaxCount preference below to attempt to ensure unrolling doesn't create too
4977// many strided loads.
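// Illustrative example: with the MaxStridedLoads limit of 7 below, a loop
// containing one strided load gets MaxCount = 1 << Log2_32(7) = 4, while two
// or three strided loads give 1 << Log2_32(3) = 2 and 1 << Log2_32(2) = 2.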
4978static void
4979 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4980 TargetTransformInfo::UnrollingPreferences &UP) {
4981 enum { MaxStridedLoads = 7 };
4982 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4983 int StridedLoads = 0;
4984 // FIXME? We could make this more precise by looking at the CFG and
4985 // e.g. not counting loads in each side of an if-then-else diamond.
4986 for (const auto BB : L->blocks()) {
4987 for (auto &I : *BB) {
4988 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4989 if (!LMemI)
4990 continue;
4991
4992 Value *PtrValue = LMemI->getPointerOperand();
4993 if (L->isLoopInvariant(PtrValue))
4994 continue;
4995
4996 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4997 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4998 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4999 continue;
5000
5001 // FIXME? We could take pairing of unrolled load copies into account
5002 // by looking at the AddRec, but we would probably have to limit this
5003 // to loops with no stores or other memory optimization barriers.
5004 ++StridedLoads;
5005 // We've seen enough strided loads that seeing more won't make a
5006 // difference.
5007 if (StridedLoads > MaxStridedLoads / 2)
5008 return StridedLoads;
5009 }
5010 }
5011 return StridedLoads;
5012 };
5013
5014 int StridedLoads = countStridedLoads(L, SE);
5015 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5016 << " strided loads\n");
5017 // Pick the largest power of 2 unroll count that won't result in too many
5018 // strided loads.
5019 if (StridedLoads) {
5020 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5021 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5022 << UP.MaxCount << '\n');
5023 }
5024}
5025
5026// This function returns true if the loop:
5027// 1. Has a valid cost, and
5028// 2. Has a cost within the supplied budget.
5029// Otherwise it returns false.
5030 static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5031 InstructionCost Budget,
5032 unsigned *FinalSize) {
5033 // Estimate the size of the loop.
5034 InstructionCost LoopCost = 0;
5035
5036 for (auto *BB : L->getBlocks()) {
5037 for (auto &I : *BB) {
5038 SmallVector<const Value *, 4> Operands(I.operand_values());
5039 InstructionCost Cost =
5040 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5041 // This can happen with intrinsics that don't currently have a cost model
5042 // or for some operations that require SVE.
5043 if (!Cost.isValid())
5044 return false;
5045
5046 LoopCost += Cost;
5047 if (LoopCost > Budget)
5048 return false;
5049 }
5050 }
5051
5052 if (FinalSize)
5053 *FinalSize = LoopCost.getValue();
5054 return true;
5055}
5056
5057 static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5058 const AArch64TTIImpl &TTI) {
5059 // Only consider loops with unknown trip counts for which we can determine
5060 // a symbolic expression. Multi-exit loops with small known trip counts will
5061 // likely be unrolled anyway.
5062 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5063 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5064 return false;
5065
5066 // It might not be worth unrolling loops with low max trip counts. Restrict
5067 // this to max trip counts > 32 for now.
5068 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5069 if (MaxTC > 0 && MaxTC <= 32)
5070 return false;
5071
5072 // Make sure the loop size is <= 5.
5073 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5074 return false;
5075
5076 // Small search loops with multiple exits can be highly beneficial to unroll.
5077 // We only care about loops with exactly two exiting blocks, although each
5078 // block could jump to the same exit block.
5079 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5080 if (Blocks.size() != 2)
5081 return false;
5082
5083 if (any_of(Blocks, [](BasicBlock *BB) {
5084 return !isa<BranchInst>(BB->getTerminator());
5085 }))
5086 return false;
5087
5088 return true;
5089}
5090
5091 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5092 /// OOO engine's wide instruction window and various predictors.
5093static void
5094 getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5095 TargetTransformInfo::UnrollingPreferences &UP,
5096 const AArch64TTIImpl &TTI) {
5097 // Limit this to loops with structure that is highly likely to benefit from
5098 // runtime unrolling, i.e. we exclude outer loops and loops with many blocks
5099 // (likely with complex control flow). Note that the heuristics here may be
5100 // overly conservative and we err on the side of avoiding runtime unrolling
5101 // rather than unrolling excessively. They are all subject to further refinement.
5102 if (!L->isInnermost() || L->getNumBlocks() > 8)
5103 return;
5104
5105 // Loops with multiple exits are handled by common code.
5106 if (!L->getExitBlock())
5107 return;
5108
5109 // Check if the loop contains any reductions that could be parallelized when
5110 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5111 // a multiple of 2.
5112 bool HasParellelizableReductions =
5113 L->getNumBlocks() == 1 &&
5114 any_of(L->getHeader()->phis(),
5115 [&SE, L](PHINode &Phi) {
5116 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5117 }) &&
5118 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5119 if (HasParellelizableReductions &&
5120 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5121 UP.Partial = true;
5122 UP.MaxCount = 4;
5123 UP.AddAdditionalAccumulators = true;
5124 }
5125
5126 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5127 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5128 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5129 SE.getSmallConstantMaxTripCount(L) <= 32))
5130 return;
5131
5132 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5133 return;
5134
5136 return;
5137
5138 // Limit to loops with trip counts that are cheap to expand.
5139 UP.SCEVExpansionBudget = 1;
5140
5141 if (HasParellelizableReductions) {
5142 UP.Runtime = true;
5144 UP.AddAdditionalAccumulators = true;
5145 }
5146
5147 // Try to unroll small, few-block loops with a low size budget if they have
5148 // load/store dependencies, to expose more parallel memory access streams, or
5149 // if they do little work inside a block (i.e. a load -> X -> store pattern).
5150 BasicBlock *Header = L->getHeader();
5151 BasicBlock *Latch = L->getLoopLatch();
5152 if (Header == Latch) {
5153 // Estimate the size of the loop.
5154 unsigned Size;
5155 unsigned Width = 10;
5156 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5157 return;
5158
5159 // Try to find an unroll count that maximizes the use of the instruction
5160 // window, i.e. trying to fetch as many instructions per cycle as possible.
5161 unsigned MaxInstsPerLine = 16;
5162 unsigned UC = 1;
5163 unsigned BestUC = 1;
5164 unsigned SizeWithBestUC = BestUC * Size;
5165 while (UC <= 8) {
5166 unsigned SizeWithUC = UC * Size;
5167 if (SizeWithUC > 48)
5168 break;
5169 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5170 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5171 BestUC = UC;
5172 SizeWithBestUC = BestUC * Size;
5173 }
5174 UC++;
5175 }
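// Illustrative example: for a loop body of Size = 10 the candidate sizes are
// 10, 20, 30 and 40 (50 exceeds the 48-instruction limit); 30 leaves the
// largest remainder modulo MaxInstsPerLine (16), so BestUC becomes 3.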
5176
5177 if (BestUC == 1)
5178 return;
5179
5180 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5181 SmallVector<StoreInst *> Stores;
5182 for (auto *BB : L->blocks()) {
5183 for (auto &I : *BB) {
5184 Value *Ptr = getLoadStorePointerOperand(&I);
5185 if (!Ptr)
5186 continue;
5187 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5188 if (SE.isLoopInvariant(PtrSCEV, L))
5189 continue;
5190 if (isa<LoadInst>(&I)) {
5191 LoadedValuesPlus.insert(&I);
5192 // Include in-loop 1st users of loaded values.
5193 for (auto *U : I.users())
5194 if (L->contains(cast<Instruction>(U)))
5195 LoadedValuesPlus.insert(U);
5196 } else
5197 Stores.push_back(cast<StoreInst>(&I));
5198 }
5199 }
5200
5201 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5202 return LoadedValuesPlus.contains(SI->getOperand(0));
5203 }))
5204 return;
5205
5206 UP.Runtime = true;
5207 UP.DefaultUnrollRuntimeCount = BestUC;
5208 return;
5209 }
5210
5211 // Try to runtime-unroll loops with early-continues depending on loop-varying
5212 // loads; this helps with branch-prediction for the early-continues.
5213 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5215 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5216 !llvm::is_contained(Preds, Header) ||
5217 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5218 return;
5219
5220 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5221 [&](Instruction *I, unsigned Depth) -> bool {
5222 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5223 return false;
5224
5225 if (isa<LoadInst>(I))
5226 return true;
5227
5228 return any_of(I->operands(), [&](Value *V) {
5229 auto *I = dyn_cast<Instruction>(V);
5230 return I && DependsOnLoopLoad(I, Depth + 1);
5231 });
5232 };
5233 CmpPredicate Pred;
5234 Instruction *I;
5235 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5236 m_Value())) &&
5237 DependsOnLoopLoad(I, 0)) {
5238 UP.Runtime = true;
5239 }
5240}
5241
5242 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5243 TTI::UnrollingPreferences &UP,
5244 OptimizationRemarkEmitter *ORE) const {
5245 // Enable partial unrolling and runtime unrolling.
5246 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5247
5248 UP.UpperBound = true;
5249
5250 // An inner loop is more likely to be hot, and the runtime check can be
5251 // hoisted out of the loop by LICM, so the overhead is lower; try a larger
5252 // threshold to unroll more loops.
5253 if (L->getLoopDepth() > 1)
5254 UP.PartialThreshold *= 2;
5255
5256 // Disable partial & runtime unrolling on -Os.
5257 UP.PartialOptSizeThreshold = 0;
5258
5259 // Scan the loop: don't unroll loops with calls as this could prevent
5260 // inlining. Don't unroll auto-vectorized loops either, though do allow
5261 // unrolling of the scalar remainder.
5262 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5263 InstructionCost Cost = 0;
5264 for (auto *BB : L->getBlocks()) {
5265 for (auto &I : *BB) {
5266 // Both auto-vectorized loops and the scalar remainder have the
5267 // isvectorized attribute, so differentiate between them by the presence
5268 // of vector instructions.
5269 if (IsVectorized && I.getType()->isVectorTy())
5270 return;
5271 if (isa<CallBase>(I)) {
5273 if (const Function *F = cast<CallBase>(I).getCalledFunction())
5274 if (!isLoweredToCall(F))
5275 continue;
5276 return;
5277 }
5278
5279 SmallVector<const Value *, 4> Operands(I.operand_values());
5280 Cost += getInstructionCost(&I, Operands,
5282 }
5283 }
5284
5285 // Apply subtarget-specific unrolling preferences.
5286 switch (ST->getProcFamily()) {
5287 case AArch64Subtarget::AppleA14:
5288 case AArch64Subtarget::AppleA15:
5289 case AArch64Subtarget::AppleA16:
5290 case AArch64Subtarget::AppleM4:
5291 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5292 break;
5293 case AArch64Subtarget::Falkor:
5296 break;
5297 default:
5298 break;
5299 }
5300
5301 // If this is a small, multi-exit loop similar to something like std::find,
5302 // then there is typically a performance improvement achieved by unrolling.
5303 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5304 UP.RuntimeUnrollMultiExit = true;
5305 UP.Runtime = true;
5306 // Limit unroll count.
5307 UP.DefaultUnrollRuntimeCount = 4;
5308 // Allow slightly more costly trip-count expansion to catch search loops
5309 // with pointer inductions.
5310 UP.SCEVExpansionBudget = 5;
5311 return;
5312 }
5313
5314 // Enable runtime unrolling for in-order models
5315 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so by
5316 // checking for that case, we can ensure that the default behaviour is
5317 // unchanged.
5318 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5319 !ST->getSchedModel().isOutOfOrder()) {
5320 UP.Runtime = true;
5321 UP.Partial = true;
5322 UP.UnrollRemainder = true;
5323 UP.DefaultUnrollRuntimeCount = 4;
5324
5325 UP.UnrollAndJam = true;
5326 UP.UnrollAndJamInnerLoopThreshold = 60;
5327 }
5328
5329 // Force-unrolling small loops can be very useful because of the
5330 // branch-taken cost of the backedge.
5332 UP.Force = true;
5333}
5334
5339
5340 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5341 Type *ExpectedType,
5342 bool CanCreate) const {
5343 switch (Inst->getIntrinsicID()) {
5344 default:
5345 return nullptr;
5346 case Intrinsic::aarch64_neon_st2:
5347 case Intrinsic::aarch64_neon_st3:
5348 case Intrinsic::aarch64_neon_st4: {
5349 // Create a struct type
5350 StructType *ST = dyn_cast<StructType>(ExpectedType);
5351 if (!CanCreate || !ST)
5352 return nullptr;
5353 unsigned NumElts = Inst->arg_size() - 1;
5354 if (ST->getNumElements() != NumElts)
5355 return nullptr;
5356 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5357 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5358 return nullptr;
5359 }
5360 Value *Res = PoisonValue::get(ExpectedType);
5361 IRBuilder<> Builder(Inst);
5362 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5363 Value *L = Inst->getArgOperand(i);
5364 Res = Builder.CreateInsertValue(Res, L, i);
5365 }
5366 return Res;
5367 }
5368 case Intrinsic::aarch64_neon_ld2:
5369 case Intrinsic::aarch64_neon_ld3:
5370 case Intrinsic::aarch64_neon_ld4:
5371 if (Inst->getType() == ExpectedType)
5372 return Inst;
5373 return nullptr;
5374 }
5375}
5376
5377 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5378 MemIntrinsicInfo &Info) const {
5379 switch (Inst->getIntrinsicID()) {
5380 default:
5381 break;
5382 case Intrinsic::aarch64_neon_ld2:
5383 case Intrinsic::aarch64_neon_ld3:
5384 case Intrinsic::aarch64_neon_ld4:
5385 Info.ReadMem = true;
5386 Info.WriteMem = false;
5387 Info.PtrVal = Inst->getArgOperand(0);
5388 break;
5389 case Intrinsic::aarch64_neon_st2:
5390 case Intrinsic::aarch64_neon_st3:
5391 case Intrinsic::aarch64_neon_st4:
5392 Info.ReadMem = false;
5393 Info.WriteMem = true;
5394 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5395 break;
5396 }
5397
5398 switch (Inst->getIntrinsicID()) {
5399 default:
5400 return false;
5401 case Intrinsic::aarch64_neon_ld2:
5402 case Intrinsic::aarch64_neon_st2:
5403 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5404 break;
5405 case Intrinsic::aarch64_neon_ld3:
5406 case Intrinsic::aarch64_neon_st3:
5407 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5408 break;
5409 case Intrinsic::aarch64_neon_ld4:
5410 case Intrinsic::aarch64_neon_st4:
5411 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5412 break;
5413 }
5414 return true;
5415}
5416
5417 /// See if \p I should be considered for address type promotion. We check if \p
5418 /// I is a sext with the right type and used in memory accesses. If it is used
5419 /// in a "complex" getelementptr, we allow it to be promoted without finding
5420 /// other sext instructions that sign extended the same initial value. A
5421 /// getelementptr is considered "complex" if it has more than 2 operands.
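/// For example, a sext feeding "getelementptr i32, ptr %p, i64 %idxprom" (2
/// operands) is not considered complex, whereas one feeding "getelementptr
/// [16 x i32], ptr %p, i64 %idxprom, i64 %j" (3 operands) is, so the latter
/// allows promotion without a common header.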
5423 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5424 bool Considerable = false;
5425 AllowPromotionWithoutCommonHeader = false;
5426 if (!isa<SExtInst>(&I))
5427 return false;
5428 Type *ConsideredSExtType =
5429 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5430 if (I.getType() != ConsideredSExtType)
5431 return false;
5432 // See if the sext is the one with the right type and used in at least one
5433 // GetElementPtrInst.
5434 for (const User *U : I.users()) {
5435 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5436 Considerable = true;
5437 // A getelementptr is considered as "complex" if it has more than 2
5438 // operands. We will promote a SExt used in such complex GEP as we
5439 // expect some computation to be merged if they are done on 64 bits.
5440 if (GEPInst->getNumOperands() > 2) {
5441 AllowPromotionWithoutCommonHeader = true;
5442 break;
5443 }
5444 }
5445 }
5446 return Considerable;
5447}
5448
5449 bool AArch64TTIImpl::isLegalToVectorizeReduction(
5450 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5451 if (!VF.isScalable())
5452 return true;
5453
5454 Type *Ty = RdxDesc.getRecurrenceType();
5455 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5456 return false;
5457
5458 switch (RdxDesc.getRecurrenceKind()) {
5459 case RecurKind::Sub:
5461 case RecurKind::Add:
5462 case RecurKind::FAdd:
5463 case RecurKind::And:
5464 case RecurKind::Or:
5465 case RecurKind::Xor:
5466 case RecurKind::SMin:
5467 case RecurKind::SMax:
5468 case RecurKind::UMin:
5469 case RecurKind::UMax:
5470 case RecurKind::FMin:
5471 case RecurKind::FMax:
5472 case RecurKind::FMulAdd:
5473 case RecurKind::AnyOf:
5474 return true;
5475 default:
5476 return false;
5477 }
5478}
5479
5480 InstructionCost
5481 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5482 FastMathFlags FMF,
5483 TTI::TargetCostKind CostKind) const {
5484 // The code-generator is currently not able to handle scalable vectors
5485 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5486 // it. This change will be removed when code-generation for these types is
5487 // sufficiently reliable.
5488 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5489 if (VTy->getElementCount() == ElementCount::getScalable(1))
5490 return InstructionCost::getInvalid();
5491
5492 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5493
5494 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5495 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5496
5497 InstructionCost LegalizationCost = 0;
5498 if (LT.first > 1) {
5499 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5500 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5501 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5502 }
5503
5504 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5505}
5506
5507 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5508 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5509 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5510 InstructionCost LegalizationCost = 0;
5511 if (LT.first > 1) {
5512 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5513 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5514 LegalizationCost *= LT.first - 1;
5515 }
5516
5517 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5518 assert(ISD && "Invalid opcode");
5519 // Add the final reduction cost for the legal horizontal reduction
5520 switch (ISD) {
5521 case ISD::ADD:
5522 case ISD::AND:
5523 case ISD::OR:
5524 case ISD::XOR:
5525 case ISD::FADD:
5526 return LegalizationCost + 2;
5527 default:
5528 return InstructionCost::getInvalid();
5529 }
5530}
5531
5532 InstructionCost
5533 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5534 std::optional<FastMathFlags> FMF,
5535 TTI::TargetCostKind CostKind) const {
5536 // The code-generator is currently not able to handle scalable vectors
5537 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5538 // it. This change will be removed when code-generation for these types is
5539 // sufficiently reliable.
5540 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5541 if (VTy->getElementCount() == ElementCount::getScalable(1))
5542 return InstructionCost::getInvalid();
5543
5544 if (TTI::requiresOrderedReduction(FMF)) {
5545 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5546 InstructionCost BaseCost =
5547 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5548 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5549 // end up vectorizing for more computationally intensive loops.
5550 return BaseCost + FixedVTy->getNumElements();
5551 }
5552
5553 if (Opcode != Instruction::FAdd)
5554 return InstructionCost::getInvalid();
5555
5556 auto *VTy = cast<ScalableVectorType>(ValTy);
5557 InstructionCost Cost =
5558 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5559 Cost *= getMaxNumElements(VTy->getElementCount());
5560 return Cost;
5561 }
5562
5563 if (isa<ScalableVectorType>(ValTy))
5564 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5565
5566 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5567 MVT MTy = LT.second;
5568 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5569 assert(ISD && "Invalid opcode");
5570
5571 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5572 // instructions as twice a normal vector add, plus 1 for each legalization
5573 // step (LT.first). This is the only arithmetic vector reduction operation for
5574 // which we have an instruction.
5575 // OR, XOR and AND costs should match the codegen from:
5576 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5577 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5578 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
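// Illustrative example: a v16i32 add reduction legalizes to four v4i32 parts
// (LT.first == 4), so the table below gives a cost of (4 - 1) + 2 = 5.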
5579 static const CostTblEntry CostTblNoPairwise[]{
5580 {ISD::ADD, MVT::v8i8, 2},
5581 {ISD::ADD, MVT::v16i8, 2},
5582 {ISD::ADD, MVT::v4i16, 2},
5583 {ISD::ADD, MVT::v8i16, 2},
5584 {ISD::ADD, MVT::v2i32, 2},
5585 {ISD::ADD, MVT::v4i32, 2},
5586 {ISD::ADD, MVT::v2i64, 2},
5587 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5588 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5589 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5590 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5591 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5592 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5593 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5594 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5595 {ISD::XOR, MVT::v16i8, 7},
5596 {ISD::XOR, MVT::v4i16, 4},
5597 {ISD::XOR, MVT::v8i16, 6},
5598 {ISD::XOR, MVT::v2i32, 3},
5599 {ISD::XOR, MVT::v4i32, 5},
5600 {ISD::XOR, MVT::v2i64, 3},
5601 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5602 {ISD::AND, MVT::v16i8, 7},
5603 {ISD::AND, MVT::v4i16, 4},
5604 {ISD::AND, MVT::v8i16, 6},
5605 {ISD::AND, MVT::v2i32, 3},
5606 {ISD::AND, MVT::v4i32, 5},
5607 {ISD::AND, MVT::v2i64, 3},
5608 };
5609 switch (ISD) {
5610 default:
5611 break;
5612 case ISD::FADD:
5613 if (Type *EltTy = ValTy->getScalarType();
5614 // FIXME: For half types without fullfp16 support, this could extend and
5615 // use a fp32 faddp reduction but current codegen unrolls.
5616 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5617 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5618 const unsigned NElts = MTy.getVectorNumElements();
5619 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5620 isPowerOf2_32(NElts))
5621 // Reduction corresponding to series of fadd instructions is lowered to
5622 // series of faddp instructions. faddp has latency/throughput that
5623 // matches fadd instruction and hence, every faddp instruction can be
5624 // considered to have a relative cost = 1 with
5625 // CostKind = TCK_RecipThroughput.
5626 // An faddp will pairwise add vector elements, so the size of input
5627 // vector reduces by half every time, requiring
5628 // #(faddp instructions) = log2_32(NElts).
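// Illustrative example: a v8f32 fadd reduction legalizes to two v4f32 halves
// (LT.first == 2, NElts == 4), giving a cost of (2 - 1) + Log2_32(4) = 3.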
5629 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5630 }
5631 break;
5632 case ISD::ADD:
5633 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5634 return (LT.first - 1) + Entry->Cost;
5635 break;
5636 case ISD::XOR:
5637 case ISD::AND:
5638 case ISD::OR:
5639 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5640 if (!Entry)
5641 break;
5642 auto *ValVTy = cast<FixedVectorType>(ValTy);
5643 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5644 isPowerOf2_32(ValVTy->getNumElements())) {
5645 InstructionCost ExtraCost = 0;
5646 if (LT.first != 1) {
5647 // Type needs to be split, so there is an extra cost of LT.first - 1
5648 // arithmetic ops.
5649 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5650 MTy.getVectorNumElements());
5651 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5652 ExtraCost *= LT.first - 1;
5653 }
5654 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5655 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5656 return Cost + ExtraCost;
5657 }
5658 break;
5659 }
5660 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5661}
5662
5664 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5665 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5666 EVT VecVT = TLI->getValueType(DL, VecTy);
5667 EVT ResVT = TLI->getValueType(DL, ResTy);
5668
5669 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5670 VecVT.getSizeInBits() >= 64) {
5671 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5672
5673 // The legal cases are:
5674 // UADDLV 8/16/32->32
5675 // UADDLP 32->64
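// Illustrative example: a zero-extending add reduction from v16i8 to i32 maps
// to a single UADDLV, so with LT.first == 1 the cost below is 2.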
5676 unsigned RevVTSize = ResVT.getSizeInBits();
5677 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5678 RevVTSize <= 32) ||
5679 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5680 RevVTSize <= 32) ||
5681 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5682 RevVTSize <= 64))
5683 return (LT.first - 1) * 2 + 2;
5684 }
5685
5686 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5687 CostKind);
5688}
5689
5691AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5692 Type *ResTy, VectorType *VecTy,
5694 EVT VecVT = TLI->getValueType(DL, VecTy);
5695 EVT ResVT = TLI->getValueType(DL, ResTy);
5696
5697 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5698 RedOpcode == Instruction::Add) {
5699 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5700
5701 // The legal cases with dotprod are
5702 // UDOT 8->32
5703 // Which requires an additional uaddv to sum the i32 values.
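// Illustrative example: a v16i8 -> i32 dot-product style reduction maps to
// UDOT/SDOT plus a final UADDV, so with LT.first == 1 the cost below is 3.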
5704 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5705 ResVT == MVT::i32)
5706 return LT.first + 2;
5707 }
5708
5709 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5710 CostKind);
5711}
5712
5713 InstructionCost
5714 AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5715 TTI::TargetCostKind CostKind) const {
5716 static const CostTblEntry ShuffleTbl[] = {
5717 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5718 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5719 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5720 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5721 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5722 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5723 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5724 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5725 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5726 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5727 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5728 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5729 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5730 };
5731
5732 // The code-generator is currently not able to handle scalable vectors
5733 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5734 // it. This change will be removed when code-generation for these types is
5735 // sufficiently reliable.
5736 if (Tp->getElementCount() == ElementCount::getScalable(1))
5737 return InstructionCost::getInvalid();
5738
5739 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5740 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5741 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5742 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5743 : LT.second;
5744 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5745 InstructionCost LegalizationCost = 0;
5746 if (Index < 0) {
5747 LegalizationCost =
5748 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5749 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5750 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5751 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5752 }
5753
5754 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5755 // Cost performed on a promoted type.
5756 if (LT.second.getScalarType() == MVT::i1) {
5757 LegalizationCost +=
5758 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5759 TTI::CastContextHint::None, CostKind) +
5760 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5761 TTI::CastContextHint::None, CostKind);
5762 }
5763 const auto *Entry =
5764 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5765 assert(Entry && "Illegal Type for Splice");
5766 LegalizationCost += Entry->Cost;
5767 return LegalizationCost * LT.first;
5768}
5769
5770 InstructionCost AArch64TTIImpl::getPartialReductionCost(
5771 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5772 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5773 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5774 TTI::TargetCostKind CostKind) const {
5775 InstructionCost Invalid = InstructionCost::getInvalid();
5776
5777 if (CostKind != TTI::TCK_RecipThroughput)
5778 return Invalid;
5779
5780 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5781 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5782 return Invalid;
5783
5784 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5785 OpAExtend == TTI::PR_None)
5786 return Invalid;
5787
5788 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5789 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5790 "Unexpected values for OpBExtend or InputTypeB");
5791
5792 // We only support multiply binary operations for now, and for muls we
5793 // require the types being extended to be the same.
5794 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5795 return Invalid;
5796
5797 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5798 if (IsUSDot && !ST->hasMatMulInt8())
5799 return Invalid;
5800
5801 unsigned Ratio =
5802 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5803 if (VF.getKnownMinValue() <= Ratio)
5804 return Invalid;
5805
5806 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5807 VectorType *AccumVectorType =
5808 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
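// Illustrative example: with i8 inputs accumulating into i32, Ratio == 4, so
// a VF of 16 yields a <16 x i8> input vector and a <4 x i32> accumulator.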
5809 // We don't yet support all kinds of legalization.
5810 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5811 EVT::getEVT(AccumVectorType));
5812 switch (TC.first) {
5813 default:
5814 return Invalid;
5818 // The legalised type (e.g. after splitting) must be legal too.
5819 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5820 TargetLoweringBase::TypeLegal)
5821 return Invalid;
5822 break;
5823 }
5824
5825 std::pair<InstructionCost, MVT> AccumLT =
5826 getTypeLegalizationCost(AccumVectorType);
5827 std::pair<InstructionCost, MVT> InputLT =
5828 getTypeLegalizationCost(InputVectorType);
5829
5830 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5831
5832 // Prefer using full types by costing half-full input types as more expensive.
5833 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5835 // FIXME: This can be removed after the cost of the extends are folded into
5836 // the dot-product expression in VPlan, after landing:
5837 // https://github.com/llvm/llvm-project/pull/147302
5838 Cost *= 2;
5839
5840 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5841 // i16 -> i64 is natively supported for udot/sdot
5842 if (AccumLT.second.getScalarType() == MVT::i64 &&
5843 InputLT.second.getScalarType() == MVT::i16)
5844 return Cost;
5845 // i8 -> i64 is supported with an extra level of extends
5846 if (AccumLT.second.getScalarType() == MVT::i64 &&
5847 InputLT.second.getScalarType() == MVT::i8)
5848 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5849 // because it requires two extra extends on the inputs. But if we'd change
5850 // that now, a regular reduction would be cheaper because the costs of
5851 // the extends in the IR are still counted. This can be fixed
5852 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5853 return Cost;
5854 }
5855
5856 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5857 if (ST->isSVEorStreamingSVEAvailable() ||
5858 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5859 ST->hasDotProd())) {
5860 if (AccumLT.second.getScalarType() == MVT::i32 &&
5861 InputLT.second.getScalarType() == MVT::i8)
5862 return Cost;
5863 }
5864
5865 // Add additional cost for the extends that would need to be inserted.
5866 return Cost + 2;
5867}
5868
5871 VectorType *SrcTy, ArrayRef<int> Mask,
5872 TTI::TargetCostKind CostKind, int Index,
5874 const Instruction *CxtI) const {
5875 assert((Mask.empty() || DstTy->isScalableTy() ||
5876 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5877 "Expected the Mask to match the return size if given");
5878 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5879 "Expected the same scalar types");
5880 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5881
5882 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5883 // into smaller vectors and sum the cost of each shuffle.
5884 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5885 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5886 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5887 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5888 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5889 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5890 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5891 // cost than just the load.
5892 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5893 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
5894 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
5895 return std::max<InstructionCost>(1, LT.first / 4);
5896
5897 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5898 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5899 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5900 // cost than just the store.
5901 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5902 (ShuffleVectorInst::isInterleaveMask(
5903 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5904 ShuffleVectorInst::isInterleaveMask(
5905 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5906 return LT.first;
5907
5908 unsigned TpNumElts = Mask.size();
5909 unsigned LTNumElts = LT.second.getVectorNumElements();
5910 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5911 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5912 LT.second.getVectorElementCount());
5913 InstructionCost Cost;
5914 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5915 PreviousCosts;
5916 for (unsigned N = 0; N < NumVecs; N++) {
5917 SmallVector<int> NMask;
5918 // Split the existing mask into chunks of size LTNumElts. Track the source
5919 // sub-vectors to ensure the result has at most 2 inputs.
5920 unsigned Source1 = -1U, Source2 = -1U;
5921 unsigned NumSources = 0;
5922 for (unsigned E = 0; E < LTNumElts; E++) {
5923 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5924 : PoisonMaskElem;
5925 if (MaskElt < 0) {
5926 NMask.push_back(PoisonMaskElem);
5927 continue;
5928 }
5929
5930 // Calculate which source from the input this comes from and whether it
5931 // is new to us.
5932 unsigned Source = MaskElt / LTNumElts;
5933 if (NumSources == 0) {
5934 Source1 = Source;
5935 NumSources = 1;
5936 } else if (NumSources == 1 && Source != Source1) {
5937 Source2 = Source;
5938 NumSources = 2;
5939 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5940 NumSources++;
5941 }
5942
5943 // Add to the new mask. For the NumSources>2 case these are not correct,
5944 // but are only used for the modular lane number.
5945 if (Source == Source1)
5946 NMask.push_back(MaskElt % LTNumElts);
5947 else if (Source == Source2)
5948 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5949 else
5950 NMask.push_back(MaskElt % LTNumElts);
5951 }
5952 // Check if we have already generated this sub-shuffle, which means we
5953 // will have already generated the output. For example a <16 x i32> splat
5954 // will be the same sub-splat 4 times, which only needs to be generated
5955 // once and reused.
5956 auto Result =
5957 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5958 // Check if it was already in the map (already costed).
5959 if (!Result.second)
5960 continue;
5961 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5962 // getShuffleCost. If not then cost it using the worst case as the number
5963 // of element moves into a new vector.
5964 InstructionCost NCost =
5965 NumSources <= 2
5966 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5967 : TTI::SK_PermuteTwoSrc,
5968 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5969 CxtI)
5970 : LTNumElts;
5971 Result.first->second = NCost;
5972 Cost += NCost;
5973 }
5974 return Cost;
5975 }
5976
5977 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5978 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5979 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
5980 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
5981 // This currently only handles low or high extracts to prevent SLP vectorizer
5982 // regressions.
5983 // Note that SVE's ext instruction is destructive, but it can be fused with
5984 // a movprfx to act like a constructive instruction.
5985 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5986 if (LT.second.getFixedSizeInBits() >= 128 &&
5987 cast<FixedVectorType>(SubTp)->getNumElements() ==
5988 LT.second.getVectorNumElements() / 2) {
5989 if (Index == 0)
5990 return 0;
5991 if (Index == (int)LT.second.getVectorNumElements() / 2)
5992 return 1;
5993 }
5994 Kind = TTI::SK_PermuteSingleSrc;
5995 }
5996 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5997 // the code to handle length-changing shuffles.
5998 if (Kind == TTI::SK_InsertSubvector) {
5999 LT = getTypeLegalizationCost(DstTy);
6000 SrcTy = DstTy;
6001 }
6002
6003 // Check for identity masks, which we can treat as free for both fixed and
6004 // scalable vector paths.
6005 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6006 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6007 all_of(enumerate(Mask), [](const auto &M) {
6008 return M.value() < 0 || M.value() == (int)M.index();
6009 }))
6010 return 0;
6011
6012 // Segmented shuffle matching.
6013 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6014 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6015 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6017
6019 unsigned Segments =
6021 unsigned SegmentElts = VTy->getNumElements() / Segments;
6022
6023 // dupq zd.t, zn.t[idx]
6024 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6025 ST->isSVEorStreamingSVEAvailable() &&
6026 isDUPQMask(Mask, Segments, SegmentElts))
6027 return LT.first;
6028
6029 // mov zd.q, vn
6030 if (ST->isSVEorStreamingSVEAvailable() &&
6031 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6032 return LT.first;
6033 }
6034
6035 // Check for broadcast loads, which are supported by the LD1R instruction.
6036 // In terms of code-size, the shuffle vector is free when a load + dup get
6037 // folded into a LD1R. That's what we check and return here. For performance
6038 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6039 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6040 // that we model the load + dup sequence slightly higher because LD1R is a
6041 // high latency instruction.
6042 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6043 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6044 if (IsLoad && LT.second.isVector() &&
6045 isLegalBroadcastLoad(SrcTy->getElementType(),
6046 LT.second.getVectorElementCount()))
6047 return 0;
6048 }
6049
6050 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6051 // from the perfect shuffle tables.
6052 if (Mask.size() == 4 &&
6053 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6054 (SrcTy->getScalarSizeInBits() == 16 ||
6055 SrcTy->getScalarSizeInBits() == 32) &&
6056 all_of(Mask, [](int E) { return E < 8; }))
6057 return getPerfectShuffleCost(Mask);
6058
6059 // Check for other shuffles that are not SK_ kinds but we have native
6060 // instructions for, for example ZIP and UZP.
6061 unsigned Unused;
6062 if (LT.second.isFixedLengthVector() &&
6063 LT.second.getVectorNumElements() == Mask.size() &&
6064 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6065 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6066 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6067 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6068 LT.second.getVectorNumElements(), 16) ||
6069 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6070 LT.second.getVectorNumElements(), 32) ||
6071 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6072 LT.second.getVectorNumElements(), 64) ||
6073 // Check for non-zero lane splats
6074 all_of(drop_begin(Mask),
6075 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6076 return 1;
6077
6078 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6079 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6080 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6081 static const CostTblEntry ShuffleTbl[] = {
6082 // Broadcast shuffle kinds can be performed with 'dup'.
6083 {TTI::SK_Broadcast, MVT::v8i8, 1},
6084 {TTI::SK_Broadcast, MVT::v16i8, 1},
6085 {TTI::SK_Broadcast, MVT::v4i16, 1},
6086 {TTI::SK_Broadcast, MVT::v8i16, 1},
6087 {TTI::SK_Broadcast, MVT::v2i32, 1},
6088 {TTI::SK_Broadcast, MVT::v4i32, 1},
6089 {TTI::SK_Broadcast, MVT::v2i64, 1},
6090 {TTI::SK_Broadcast, MVT::v4f16, 1},
6091 {TTI::SK_Broadcast, MVT::v8f16, 1},
6092 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6093 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6094 {TTI::SK_Broadcast, MVT::v2f32, 1},
6095 {TTI::SK_Broadcast, MVT::v4f32, 1},
6096 {TTI::SK_Broadcast, MVT::v2f64, 1},
6097 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6098 // 'zip1/zip2' instructions.
6099 {TTI::SK_Transpose, MVT::v8i8, 1},
6100 {TTI::SK_Transpose, MVT::v16i8, 1},
6101 {TTI::SK_Transpose, MVT::v4i16, 1},
6102 {TTI::SK_Transpose, MVT::v8i16, 1},
6103 {TTI::SK_Transpose, MVT::v2i32, 1},
6104 {TTI::SK_Transpose, MVT::v4i32, 1},
6105 {TTI::SK_Transpose, MVT::v2i64, 1},
6106 {TTI::SK_Transpose, MVT::v4f16, 1},
6107 {TTI::SK_Transpose, MVT::v8f16, 1},
6108 {TTI::SK_Transpose, MVT::v4bf16, 1},
6109 {TTI::SK_Transpose, MVT::v8bf16, 1},
6110 {TTI::SK_Transpose, MVT::v2f32, 1},
6111 {TTI::SK_Transpose, MVT::v4f32, 1},
6112 {TTI::SK_Transpose, MVT::v2f64, 1},
6113 // Select shuffle kinds.
6114 // TODO: handle vXi8/vXi16.
6115 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6116 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6117 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6118 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6119 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6120 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6121 // PermuteSingleSrc shuffle kinds.
6122 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6123 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6124 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6125 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6126 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6127 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6128 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6129 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6130 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6131 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6132 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6133 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6134 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6135 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6136 // Reverse can be lowered with `rev`.
6137 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6138 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6139 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6140 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6141 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6142 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6143 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6144 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6145 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6146 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6147 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6148 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6149 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6150 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6151 // Splice can all be lowered as `ext`.
6152 {TTI::SK_Splice, MVT::v2i32, 1},
6153 {TTI::SK_Splice, MVT::v4i32, 1},
6154 {TTI::SK_Splice, MVT::v2i64, 1},
6155 {TTI::SK_Splice, MVT::v2f32, 1},
6156 {TTI::SK_Splice, MVT::v4f32, 1},
6157 {TTI::SK_Splice, MVT::v2f64, 1},
6158 {TTI::SK_Splice, MVT::v8f16, 1},
6159 {TTI::SK_Splice, MVT::v8bf16, 1},
6160 {TTI::SK_Splice, MVT::v8i16, 1},
6161 {TTI::SK_Splice, MVT::v16i8, 1},
6162 {TTI::SK_Splice, MVT::v4f16, 1},
6163 {TTI::SK_Splice, MVT::v4bf16, 1},
6164 {TTI::SK_Splice, MVT::v4i16, 1},
6165 {TTI::SK_Splice, MVT::v8i8, 1},
6166 // Broadcast shuffle kinds for scalable vectors
6167 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6168 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6169 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6170 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6171 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6172 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6173 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6174 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6175 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6176 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6177 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6178 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6179 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6180 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6181 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6182 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6183 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6184 // Handle the cases for vector.reverse with scalable vectors
6185 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6186 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6187 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6188 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6189 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6190 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6191 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6192 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6193 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6194 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6195 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6196 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6197 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6198 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6199 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6200 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6201 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6202 };
6203 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6204 return LT.first * Entry->Cost;
6205 }
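// Example (editorial illustration, not part of the original source): a
// reverse shuffle of <4 x float> legalizes to MVT::v4f32 and hits the
// {TTI::SK_Reverse, MVT::v4f32, 2} entry above (REV64 followed by EXT), so
// the returned cost is LT.first * 2.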
6206
6207 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6208 return getSpliceCost(SrcTy, Index, CostKind);
6209
6210 // Inserting a subvector can often be done with either a D, S or H register
6211 // move, so long as the inserted vector is "aligned".
6212 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6213 LT.second.getSizeInBits() <= 128 && SubTp) {
6214 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6215 if (SubLT.second.isVector()) {
6216 int NumElts = LT.second.getVectorNumElements();
6217 int NumSubElts = SubLT.second.getVectorNumElements();
6218 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6219 return SubLT.first;
6220 }
6221 }
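// Example (editorial illustration, not part of the original source):
// inserting a <2 x float> subvector into a <4 x float> at index 0 or 2 is
// "aligned" and is costed as SubLT.first (a single S/D-register move), while
// index 1 is not a multiple of the subvector length and falls through to the
// generic cost below.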
6222
6223 // Restore optimal kind.
6224 if (IsExtractSubvector)
6225 Kind = TTI::SK_ExtractSubvector;
6226 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6227 Args, CxtI);
6228}
6229
6230 static bool containsDecreasingPointers(Loop *TheLoop,
6231 PredicatedScalarEvolution *PSE,
6232 const DominatorTree &DT) {
6233 const auto &Strides = DenseMap<Value *, const SCEV *>();
6234 for (BasicBlock *BB : TheLoop->blocks()) {
6235 // Scan the instructions in the block and look for addresses that are
6236 // consecutive and decreasing.
6237 for (Instruction &I : *BB) {
6238 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6239 Value *Ptr = getLoadStorePointerOperand(&I);
6240 Type *AccessTy = getLoadStoreType(&I);
6241 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6242 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6243 .value_or(0) < 0)
6244 return true;
6245 }
6246 }
6247 }
6248 return false;
6249}
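// Example (editorial illustration, not part of the original source): a loop
// that walks an array backwards, e.g. "for (i = n; i > 0; --i) sum += a[i-1];",
// accesses consecutive but decreasing addresses, so getPtrStride() returns a
// negative stride and this helper reports true.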
6250
6251 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6252 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6253 return SVEPreferFixedOverScalableIfEqualCost;
6254 // For cases like post-LTO vectorization, where we eventually learn the trip
6255 // count, a fixed-width epilogue loop can be removed entirely if the trip
6256 // count turns out to be smaller than the epilogue iterations. That is why we
6257 // prefer fixed-width vectorization for the epilogue when the costs are equal.
6258 if (IsEpilogue)
6259 return true;
6260 return ST->useFixedOverScalableIfEqualCost();
6261}
6262
6263 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6264 return ST->getEpilogueVectorizationMinVF();
6265}
6266
6267 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6268 if (!ST->hasSVE())
6269 return false;
6270
6271 // We don't currently support vectorisation with interleaving for SVE - with
6272 // such loops we're better off not using tail-folding. This gives us a chance
6273 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6274 if (TFI->IAI->hasGroups())
6275 return false;
6276
6277 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6278 if (TFI->LVL->getReductionVars().size())
6279 Required |= TailFoldingOpts::Reductions;
6280 if (TFI->LVL->getFixedOrderRecurrences().size())
6281 Required |= TailFoldingOpts::Recurrences;
6282
6283 // We call this to discover whether any load/store pointers in the loop have
6284 // negative strides. This will require extra work to reverse the loop
6285 // predicate, which may be expensive.
6286 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6287 TFI->LVL->getPredicatedScalarEvolution(),
6288 *TFI->LVL->getDominatorTree()))
6289 Required |= TailFoldingOpts::Reverse;
6290 if (Required == TailFoldingOpts::Disabled)
6291 Required |= TailFoldingOpts::Simple;
6292
6293 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6294 Required))
6295 return false;
6296
6297 // Don't tail-fold for tight loops where we would be better off interleaving
6298 // with an unpredicated loop.
6299 unsigned NumInsns = 0;
6300 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6301 NumInsns += BB->sizeWithoutDebug();
6302 }
6303
6304 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6305 return NumInsns >= SVETailFoldInsnThreshold;
6306}
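// Example (editorial illustration, not part of the original source): with
// SVETailFoldInsnThreshold at 15, a loop whose blocks contain only 12
// non-debug instructions returns false here, i.e. it is treated as too tight
// to be worth tail-folding and is left for unpredicated vectorization instead.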
6307
6308 InstructionCost
6309 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6310 StackOffset BaseOffset, bool HasBaseReg,
6311 int64_t Scale, unsigned AddrSpace) const {
6312 // Scaling factors are not free at all.
6313 // Operands | Rt Latency
6314 // -------------------------------------------
6315 // Rt, [Xn, Xm] | 4
6316 // -------------------------------------------
6317 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6318 // Rt, [Xn, Wm, <extend> #imm] |
6319 TargetLoweringBase::AddrMode AM;
6320 AM.BaseGV = BaseGV;
6321 AM.BaseOffs = BaseOffset.getFixed();
6322 AM.HasBaseReg = HasBaseReg;
6323 AM.Scale = Scale;
6324 AM.ScalableOffset = BaseOffset.getScalable();
6325 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6326 // Scale represents reg2 * scale, thus account for 1 if
6327 // it is not equal to 0 or 1.
6328 return AM.Scale != 0 && AM.Scale != 1;
6329 return InstructionCost::getInvalid();
6330}
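// Example (editorial illustration, not part of the original source): for an
// i32 access, a scaled mode like "ldr w0, [x1, x2, lsl #2]" corresponds to
// Scale == 4; it is legal but not free, so the cost returned is 1. A simple
// register-offset access (Scale == 1) or base-only access (Scale == 0)
// returns 0.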
6331
6332 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6333 const Instruction *I) const {
6334 if (EnableOrLikeSelectOpt) {
6335 // For the binary operators (e.g. or) we need to be more careful than for
6336 // selects; here we only transform them if they are already at a natural
6337 // break point in the code - the end of a block with an unconditional
6338 // terminator.
6339 if (I->getOpcode() == Instruction::Or &&
6340 isa<BranchInst>(I->getNextNode()) &&
6341 cast<BranchInst>(I->getNextNode())->isUnconditional())
6342 return true;
6343
6344 if (I->getOpcode() == Instruction::Add ||
6345 I->getOpcode() == Instruction::Sub)
6346 return true;
6347 }
6348 return false;
6349}
6350
6351 bool AArch64TTIImpl::isLSRCostLess(
6352 const TargetTransformInfo::LSRCost &C1,
6353 const TargetTransformInfo::LSRCost &C2) const {
6354 // What is AArch64-specific here is adding the number of instructions to the
6355 // comparison (though not as the first consideration, as some targets do),
6356 // along with changing the priority of the base additions.
6357 // TODO: Maybe a more nuanced tradeoff between instruction count
6358 // and number of registers? To be investigated at a later date.
6359 if (EnableLSRCostOpt)
6360 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6361 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6362 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6363 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6364
6365 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6366}
6367
6368static bool isSplatShuffle(Value *V) {
6369 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6370 return all_equal(Shuf->getShuffleMask());
6371 return false;
6372}
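// Example (editorial illustration, not part of the original source):
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
//                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// has an all-equal mask, so it is treated as a splat (of lane 1) here.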
6373
6374/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6375/// or upper half of the vector elements.
6376static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6377 bool AllowSplat = false) {
6378 // Scalable types can't be extract shuffle vectors.
6379 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6380 return false;
6381
6382 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6383 auto *FullTy = FullV->getType();
6384 auto *HalfTy = HalfV->getType();
6385 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6386 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6387 };
6388
6389 auto extractHalf = [](Value *FullV, Value *HalfV) {
6390 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6391 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6392 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6393 };
6394
6395 ArrayRef<int> M1, M2;
6396 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6397 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6398 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6399 return false;
6400
6401 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6402 // it is not checked as an extract below.
6403 if (AllowSplat && isSplatShuffle(Op1))
6404 S1Op1 = nullptr;
6405 if (AllowSplat && isSplatShuffle(Op2))
6406 S2Op1 = nullptr;
6407
6408 // Check that the operands are half as wide as the result and we extract
6409 // half of the elements of the input vectors.
6410 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6411 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6412 return false;
6413
6414 // Check the mask extracts either the lower or upper half of vector
6415 // elements.
6416 int M1Start = 0;
6417 int M2Start = 0;
6418 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6419 if ((S1Op1 &&
6420 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6421 (S2Op1 &&
6422 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6423 return false;
6424
6425 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6426 (M2Start != 0 && M2Start != (NumElements / 2)))
6427 return false;
6428 if (S1Op1 && S2Op1 && M1Start != M2Start)
6429 return false;
6430
6431 return true;
6432}
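// Example (editorial illustration, not part of the original source): with
// <8 x i16> inputs, two shuffles that both use mask <4,5,6,7> extract the
// upper halves and pass this check (the shape used by the "2" variants such
// as smull2/umull2); shuffles extracting different halves are rejected above.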
6433
6434/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6435/// of the vector elements.
6436static bool areExtractExts(Value *Ext1, Value *Ext2) {
6437 auto areExtDoubled = [](Instruction *Ext) {
6438 return Ext->getType()->getScalarSizeInBits() ==
6439 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6440 };
6441
6442 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6443 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6444 !areExtDoubled(cast<Instruction>(Ext1)) ||
6445 !areExtDoubled(cast<Instruction>(Ext2)))
6446 return false;
6447
6448 return true;
6449}
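// Example (editorial illustration, not part of the original source): two
// "sext <4 x i16> %x to <4 x i32>" operands satisfy this check, since each
// extend exactly doubles the element width.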
6450
6451/// Check if Op could be used with vmull_high_p64 intrinsic.
6452 static bool isOperandOfVmullHighP64(Value *Op) {
6453 Value *VectorOperand = nullptr;
6454 ConstantInt *ElementIndex = nullptr;
6455 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6456 m_ConstantInt(ElementIndex))) &&
6457 ElementIndex->getValue() == 1 &&
6458 isa<FixedVectorType>(VectorOperand->getType()) &&
6459 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6460}
6461
6462/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6463static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6464 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6465}
6466
6467 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6468 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6469 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6470 if (!GEP || GEP->getNumOperands() != 2)
6471 return false;
6472
6473 Value *Base = GEP->getOperand(0);
6474 Value *Offsets = GEP->getOperand(1);
6475
6476 // We only care about scalar_base+vector_offsets.
6477 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6478 return false;
6479
6480 // Sink extends that would allow us to use 32-bit offset vectors.
6481 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6482 auto *OffsetsInst = cast<Instruction>(Offsets);
6483 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6484 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6485 Ops.push_back(&GEP->getOperandUse(1));
6486 }
6487
6488 // Sink the GEP.
6489 return true;
6490}
6491
6492 /// We want to sink the following cases:
6493 /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6494 /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6495 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6496 if (match(Op, m_VScale()))
6497 return true;
6498 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6499 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6500 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6501 return true;
6502 }
6503 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6504 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6505 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6506 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6507 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6508 return true;
6509 }
6510 return false;
6511}
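// Illustrative IR (editorial note, not part of the original source):
//   %vs  = call i64 @llvm.vscale.i64()
//   %off = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %p, i64 %off
// Sinking %off (and the vscale it uses) next to the GEP lets instruction
// selection fold the whole scalable offset into the address computation.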
6512
6513/// Check if sinking \p I's operands to I's basic block is profitable, because
6514/// the operands can be folded into a target instruction, e.g.
6515 /// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
6516 bool AArch64TTIImpl::isProfitableToSinkOperands(
6517 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6518 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6519 switch (II->getIntrinsicID()) {
6520 case Intrinsic::aarch64_neon_smull:
6521 case Intrinsic::aarch64_neon_umull:
6522 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6523 /*AllowSplat=*/true)) {
6524 Ops.push_back(&II->getOperandUse(0));
6525 Ops.push_back(&II->getOperandUse(1));
6526 return true;
6527 }
6528 [[fallthrough]];
6529
6530 case Intrinsic::fma:
6531 case Intrinsic::fmuladd:
6532 if (isa<VectorType>(I->getType()) &&
6533 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6534 !ST->hasFullFP16())
6535 return false;
6536 [[fallthrough]];
6537 case Intrinsic::aarch64_neon_sqdmull:
6538 case Intrinsic::aarch64_neon_sqdmulh:
6539 case Intrinsic::aarch64_neon_sqrdmulh:
6540 // Sink splats for index lane variants
6541 if (isSplatShuffle(II->getOperand(0)))
6542 Ops.push_back(&II->getOperandUse(0));
6543 if (isSplatShuffle(II->getOperand(1)))
6544 Ops.push_back(&II->getOperandUse(1));
6545 return !Ops.empty();
6546 case Intrinsic::aarch64_neon_fmlal:
6547 case Intrinsic::aarch64_neon_fmlal2:
6548 case Intrinsic::aarch64_neon_fmlsl:
6549 case Intrinsic::aarch64_neon_fmlsl2:
6550 // Sink splats for index lane variants
6551 if (isSplatShuffle(II->getOperand(1)))
6552 Ops.push_back(&II->getOperandUse(1));
6553 if (isSplatShuffle(II->getOperand(2)))
6554 Ops.push_back(&II->getOperandUse(2));
6555 return !Ops.empty();
6556 case Intrinsic::aarch64_sve_ptest_first:
6557 case Intrinsic::aarch64_sve_ptest_last:
6558 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6559 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6560 Ops.push_back(&II->getOperandUse(0));
6561 return !Ops.empty();
6562 case Intrinsic::aarch64_sme_write_horiz:
6563 case Intrinsic::aarch64_sme_write_vert:
6564 case Intrinsic::aarch64_sme_writeq_horiz:
6565 case Intrinsic::aarch64_sme_writeq_vert: {
6566 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6567 if (!Idx || Idx->getOpcode() != Instruction::Add)
6568 return false;
6569 Ops.push_back(&II->getOperandUse(1));
6570 return true;
6571 }
6572 case Intrinsic::aarch64_sme_read_horiz:
6573 case Intrinsic::aarch64_sme_read_vert:
6574 case Intrinsic::aarch64_sme_readq_horiz:
6575 case Intrinsic::aarch64_sme_readq_vert:
6576 case Intrinsic::aarch64_sme_ld1b_vert:
6577 case Intrinsic::aarch64_sme_ld1h_vert:
6578 case Intrinsic::aarch64_sme_ld1w_vert:
6579 case Intrinsic::aarch64_sme_ld1d_vert:
6580 case Intrinsic::aarch64_sme_ld1q_vert:
6581 case Intrinsic::aarch64_sme_st1b_vert:
6582 case Intrinsic::aarch64_sme_st1h_vert:
6583 case Intrinsic::aarch64_sme_st1w_vert:
6584 case Intrinsic::aarch64_sme_st1d_vert:
6585 case Intrinsic::aarch64_sme_st1q_vert:
6586 case Intrinsic::aarch64_sme_ld1b_horiz:
6587 case Intrinsic::aarch64_sme_ld1h_horiz:
6588 case Intrinsic::aarch64_sme_ld1w_horiz:
6589 case Intrinsic::aarch64_sme_ld1d_horiz:
6590 case Intrinsic::aarch64_sme_ld1q_horiz:
6591 case Intrinsic::aarch64_sme_st1b_horiz:
6592 case Intrinsic::aarch64_sme_st1h_horiz:
6593 case Intrinsic::aarch64_sme_st1w_horiz:
6594 case Intrinsic::aarch64_sme_st1d_horiz:
6595 case Intrinsic::aarch64_sme_st1q_horiz: {
6596 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6597 if (!Idx || Idx->getOpcode() != Instruction::Add)
6598 return false;
6599 Ops.push_back(&II->getOperandUse(3));
6600 return true;
6601 }
6602 case Intrinsic::aarch64_neon_pmull:
6603 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6604 return false;
6605 Ops.push_back(&II->getOperandUse(0));
6606 Ops.push_back(&II->getOperandUse(1));
6607 return true;
6608 case Intrinsic::aarch64_neon_pmull64:
6609 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6610 II->getArgOperand(1)))
6611 return false;
6612 Ops.push_back(&II->getArgOperandUse(0));
6613 Ops.push_back(&II->getArgOperandUse(1));
6614 return true;
6615 case Intrinsic::masked_gather:
6616 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6617 return false;
6618 Ops.push_back(&II->getArgOperandUse(0));
6619 return true;
6620 case Intrinsic::masked_scatter:
6621 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6622 return false;
6623 Ops.push_back(&II->getArgOperandUse(1));
6624 return true;
6625 default:
6626 return false;
6627 }
6628 }
6629
6630 auto ShouldSinkCondition = [](Value *Cond,
6631 SmallVectorImpl<Use *> &Ops) -> bool {
6632 if (!isa<IntrinsicInst>(Cond))
6633 return false;
6634 auto *II = cast<IntrinsicInst>(Cond);
6635 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6636 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6637 return false;
6638 if (isa<CmpInst>(II->getOperand(0)))
6639 Ops.push_back(&II->getOperandUse(0));
6640 return true;
6641 };
6642
6643 switch (I->getOpcode()) {
6644 case Instruction::GetElementPtr:
6645 case Instruction::Add:
6646 case Instruction::Sub:
6647 // Sink vscales closer to uses for better isel
6648 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6649 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6650 Ops.push_back(&I->getOperandUse(Op));
6651 return true;
6652 }
6653 }
6654 break;
6655 case Instruction::Select: {
6656 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6657 return false;
6658
6659 Ops.push_back(&I->getOperandUse(0));
6660 return true;
6661 }
6662 case Instruction::Br: {
6663 if (cast<BranchInst>(I)->isUnconditional())
6664 return false;
6665
6666 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6667 return false;
6668
6669 Ops.push_back(&I->getOperandUse(0));
6670 return true;
6671 }
6672 default:
6673 break;
6674 }
6675
6676 if (!I->getType()->isVectorTy())
6677 return false;
6678
6679 switch (I->getOpcode()) {
6680 case Instruction::Sub:
6681 case Instruction::Add: {
6682 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6683 return false;
6684
6685 // If the exts' operands extract either the lower or upper elements, we
6686 // can sink them too.
6687 auto Ext1 = cast<Instruction>(I->getOperand(0));
6688 auto Ext2 = cast<Instruction>(I->getOperand(1));
6689 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6690 Ops.push_back(&Ext1->getOperandUse(0));
6691 Ops.push_back(&Ext2->getOperandUse(0));
6692 }
6693
6694 Ops.push_back(&I->getOperandUse(0));
6695 Ops.push_back(&I->getOperandUse(1));
6696
6697 return true;
6698 }
6699 case Instruction::Or: {
6700 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6701 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6702 if (ST->hasNEON()) {
6703 Instruction *OtherAnd, *IA, *IB;
6704 Value *MaskValue;
6705 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
6706 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6707 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6708 m_Instruction(IA)))))) {
6709 if (match(OtherAnd,
6710 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6711 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6712 ? cast<Instruction>(I->getOperand(1))
6713 : cast<Instruction>(I->getOperand(0));
6714
6715 // Both Ands should be in same basic block as Or
6716 if (I->getParent() != MainAnd->getParent() ||
6717 I->getParent() != OtherAnd->getParent())
6718 return false;
6719
6720 // Non-mask operands of both Ands should also be in same basic block
6721 if (I->getParent() != IA->getParent() ||
6722 I->getParent() != IB->getParent())
6723 return false;
6724
6725 Ops.push_back(
6726 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6727 Ops.push_back(&I->getOperandUse(0));
6728 Ops.push_back(&I->getOperandUse(1));
6729
6730 return true;
6731 }
6732 }
6733 }
6734
6735 return false;
6736 }
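// Illustrative IR for the Or case above (editorial note, not part of the
// original source):
//   %not = xor <16 x i8> %mask, splat (i8 -1)
//   %a   = and <16 x i8> %not, %x
//   %b   = and <16 x i8> %mask, %y
//   %r   = or  <16 x i8> %a, %b
// Sinking the xor and both ands next to the or lets the backend select a
// single NEON bit-select (bsl/bit/bif) for the whole pattern.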
6737 case Instruction::Mul: {
6738 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6739 auto *Ty = cast<VectorType>(V->getType());
6740 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6741 if (Ty->isScalableTy())
6742 return false;
6743
6744 // Indexed variants of Mul exist for i16 and i32 element types only.
6745 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6746 };
6747
6748 int NumZExts = 0, NumSExts = 0;
6749 for (auto &Op : I->operands()) {
6750 // Make sure we are not already sinking this operand
6751 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6752 continue;
6753
6754 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6755 auto *Ext = cast<Instruction>(Op);
6756 auto *ExtOp = Ext->getOperand(0);
6757 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6758 Ops.push_back(&Ext->getOperandUse(0));
6759 Ops.push_back(&Op);
6760
6761 if (isa<SExtInst>(Ext)) {
6762 NumSExts++;
6763 } else {
6764 NumZExts++;
6765 // A zext(a) that widens by more than 2x can also be viewed as sext(zext(a)).
6766 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6767 I->getType()->getScalarSizeInBits())
6768 NumSExts++;
6769 }
6770
6771 continue;
6772 }
6773
6774 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6775 if (!Shuffle)
6776 continue;
6777
6778 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6779 // operand and the s/zext can help create indexed s/umull. This is
6780 // especially useful to prevent i64 mul being scalarized.
6781 if (isSplatShuffle(Shuffle) &&
6782 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6783 Ops.push_back(&Shuffle->getOperandUse(0));
6784 Ops.push_back(&Op);
6785 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6786 NumSExts++;
6787 else
6788 NumZExts++;
6789 continue;
6790 }
6791
6792 Value *ShuffleOperand = Shuffle->getOperand(0);
6793 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6794 if (!Insert)
6795 continue;
6796
6797 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6798 if (!OperandInstr)
6799 continue;
6800
6801 ConstantInt *ElementConstant =
6802 dyn_cast<ConstantInt>(Insert->getOperand(2));
6803 // Check that the insertelement is inserting into element 0
6804 if (!ElementConstant || !ElementConstant->isZero())
6805 continue;
6806
6807 unsigned Opcode = OperandInstr->getOpcode();
6808 if (Opcode == Instruction::SExt)
6809 NumSExts++;
6810 else if (Opcode == Instruction::ZExt)
6811 NumZExts++;
6812 else {
6813 // If we find that the top bits are known 0, then we can sink and allow
6814 // the backend to generate a umull.
6815 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6816 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6817 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6818 continue;
6819 NumZExts++;
6820 }
6821
6822 // And(Load) is excluded to prevent CGP from getting stuck in a loop of
6823 // sinking the And, only to hoist it back out to the load again.
6824 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6825 Ops.push_back(&Insert->getOperandUse(1));
6826 Ops.push_back(&Shuffle->getOperandUse(0));
6827 Ops.push_back(&Op);
6828 }
6829
6830 // It is profitable to sink if we found two of the same type of extends.
6831 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6832 return true;
6833
6834 // Otherwise, see if we should sink splats for indexed variants.
6835 if (!ShouldSinkSplatForIndexedVariant(I))
6836 return false;
6837
6838 Ops.clear();
6839 if (isSplatShuffle(I->getOperand(0)))
6840 Ops.push_back(&I->getOperandUse(0));
6841 if (isSplatShuffle(I->getOperand(1)))
6842 Ops.push_back(&I->getOperandUse(1));
6843
6844 return !Ops.empty();
6845 }
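// Example for the Mul case above (editorial illustration, not part of the
// original source): given
//   %a = sext <2 x i32> %x to <2 x i64>
//   %b = sext <2 x i32> %y to <2 x i64>
//   %m = mul <2 x i64> %a, %b
// sinking both sexts next to the mul (NumSExts == 2) lets the backend form a
// single smull rather than scalarizing the 64-bit multiply.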
6846 case Instruction::FMul: {
6847 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6848 if (I->getType()->isScalableTy())
6849 return false;
6850
6851 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6852 !ST->hasFullFP16())
6853 return false;
6854
6855 // Sink splats for index lane variants
6856 if (isSplatShuffle(I->getOperand(0)))
6857 Ops.push_back(&I->getOperandUse(0));
6858 if (isSplatShuffle(I->getOperand(1)))
6859 Ops.push_back(&I->getOperandUse(1));
6860 return !Ops.empty();
6861 }
6862 default:
6863 return false;
6864 }
6865 return false;
6866}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
unsigned countLeadingOnes() const
Definition APInt.h:1625
void negate()
Negate this APInt in place.
Definition APInt.h:1469
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1762
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:760
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
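A minimal sketch of the IRBuilder calls above (CreateMaskedLoad with a zero pass-through), assuming the builder, pointer, mask and vector type come from the caller:
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Emits a masked load whose inactive lanes read as zero.
static Value *createMaskedLoadOrZero(IRBuilderBase &Builder, Type *VecTy,
                                     Value *Ptr, Align Alignment, Value *Mask) {
  Value *PassThru = Constant::getNullValue(VecTy);
  return Builder.CreateMaskedLoad(VecTy, Ptr, Alignment, Mask, PassThru,
                                  "masked.load");
}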
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
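A minimal sketch of the InstCombiner hooks above, in the general shape of a target instCombineIntrinsic callback; the fold itself (replacing a single-argument call with that argument) is purely illustrative:
#include <optional>
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;
// Hands the rewrite back to the combiner, which erases II afterwards.
static std::optional<Instruction *> foldToFirstArg(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  if (II.arg_size() == 1)
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  return std::nullopt;
}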
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
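A minimal sketch of the ScalarEvolution queries above: test whether a loop's trip count is known to be bounded by a small constant (the Limit parameter is the caller's choice):
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;
static bool hasSmallConstantMaxTripCount(ScalarEvolution &SE, const Loop *L,
                                         unsigned Limit) {
  // getSmallConstantMaxTripCount returns 0 when no constant bound is known.
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  return MaxTC != 0 && MaxTC <= Limit;
}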
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
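A minimal sketch of the shuffle-mask classifiers above, applied to a constant mask the caller has already extracted:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// True when Mask extracts the low-half subvector of a NumSrcElts-wide input.
static bool isExtractOfLowSubvector(ArrayRef<int> Mask, int NumSrcElts) {
  int Index = 0;
  return ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
         Index == 0;
}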
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
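A minimal sketch of the cost-model enums above: querying TTI for the reciprocal-throughput cost of a zero-extend in a normal (non-masked) memory context. The exact getCastInstrCost signature is assumed to match current TargetTransformInfo:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
static InstructionCost zextThroughputCost(const TargetTransformInfo &TTI,
                                          Type *Dst, Type *Src) {
  return TTI.getCastInstrCost(Instruction::ZExt, Dst, Src,
                              TargetTransformInfo::CastContextHint::Normal,
                              TargetTransformInfo::TCK_RecipThroughput);
}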
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
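A minimal sketch of the Type queries above: widen an integer scalar or vector type to twice its lane width while preserving the element count:
#include <cassert>
#include "llvm/IR/Type.h"
using namespace llvm;
static Type *widenElements(Type *Ty) {
  assert(Ty->getScalarType()->isIntegerTy() && "expected integer elements");
  // getWithNewBitWidth only changes the lane width, not the element count.
  return Ty->getWithNewBitWidth(2 * Ty->getScalarSizeInBits());
}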
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
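A minimal sketch of the TypeSize/ElementCount helpers above: compare a possibly scalable size against a fixed bit budget using only its known minimum:
#include <cstdint>
#include "llvm/Support/TypeSize.h"
using namespace llvm;
static bool fitsInBlock(TypeSize Size, uint64_t BlockBits) {
  // For scalable sizes only the known minimum is meaningful at compile time.
  if (Size.isScalable())
    return Size.getKnownMinValue() <= BlockBits;
  return Size.getFixedValue() <= BlockBits;
}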
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
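A minimal sketch of the expandMOVImm helper above: count the real move-immediate instructions needed to materialize a 64-bit constant, one common way such expansions feed an immediate cost estimate:
#include "AArch64ExpandImm.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
static unsigned movImmInstrCount(uint64_t Imm) {
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, /*BitSize=*/64, Insn);
  return Insn.size();
}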
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
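A minimal sketch of the PatternMatch combinators above: recognise an add where one operand is a single-use zero-extended compare:
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;
static bool isAddOfZExtCmp(Value *V) {
  // m_c_Add accepts the operands in either order.
  return match(V, m_c_Add(m_OneUse(m_ZExt(m_Cmp())), m_Value()));
}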
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
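A minimal sketch of CostTableLookup with an illustrative, made-up cost table keyed on ISD opcode and MVT:
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;
static unsigned lookupMulCost(MVT VT) {
  static const CostTblEntry Table[] = {
      {ISD::MUL, MVT::v4i32, 1}, // hypothetical costs, for illustration only
      {ISD::MUL, MVT::v2i64, 4},
  };
  if (const auto *Entry = CostTableLookup(Table, ISD::MUL, VT))
    return Entry->Cost;
  return 1; // fall back to a unit cost
}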
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
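A minimal sketch of the power-of-two helpers above (together with NextPowerOf2, listed further down in this index): compute the log2 of a count rounded up to a power of two:
#include <cstdint>
#include "llvm/Support/MathExtras.h"
using namespace llvm;
static unsigned log2RoundedUp(uint32_t NumElts) {
  return isPowerOf2_32(NumElts) ? Log2_32(NumElts)
                                : Log2_64(NextPowerOf2(NumElts));
}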
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
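A minimal sketch of the AArch64 shuffle-mask predicates above; the signatures follow the entries in this index, and the AArch64 backend header declaring them is assumed to be included:
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;
// WhichResult reports whether the "1" or the "2" variant matched
// (e.g. zip1 vs zip2, uzp1 vs uzp2).
static bool isZipOrUzpMask(ArrayRef<int> Mask, unsigned NumElts) {
  unsigned WhichResult = 0;
  return isZIPMask(Mask, NumElts, WhichResult) ||
         isUZPMask(Mask, NumElts, WhichResult);
}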
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
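A minimal sketch combining RecurrenceDescriptor (listed earlier) with the RecurKind values above: accept only plain integer or floating-point sum reductions:
#include "llvm/Analysis/IVDescriptors.h"
using namespace llvm;
static bool isPlainSumReduction(const RecurrenceDescriptor &RD) {
  RecurKind Kind = RD.getRecurrenceKind();
  return Kind == RecurKind::Add || Kind == RecurKind::FAdd;
}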
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2108
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
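A minimal sketch of the EVT helpers above: map an IR type to an EVT and accept only simple, fixed-length vectors:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;
static bool isSimpleFixedVector(Type *Ty) {
  EVT VT = EVT::getEVT(Ty, /*HandleUnknown=*/true);
  // isSimple() means the EVT maps directly onto an MVT.
  return VT.isSimple() && VT.isFixedLengthVector();
}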
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp; IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
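A minimal sketch of how the UnrollingPreferences fields above are typically populated in a target's unrolling-preferences hook; the concrete numbers are illustrative only:
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling with a remainder
  UP.Runtime = true;                // permit unrolling of runtime trip counts
  UP.PartialThreshold = 150;        // size budget for the unrolled body
  UP.DefaultUnrollRuntimeCount = 4; // fallback count for runtime trip counts
}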