AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
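// Illustrative examples of how TailFoldingOption::operator= above parses
// -sve-tail-folding strings (a sketch based on the parsing code, not an
// exhaustive list):
//   "all+noreductions" -> InitialBits = All, DisableBits = Reductions,
//                         so getBits() yields All & ~Reductions.
//   "default+reverse"  -> NeedsDefault = true, EnableBits = Reverse,
//                         so getBits(DefaultBits) yields DefaultBits | Reverse.
//   "reductions"       -> no initial keyword, so InitialBits = Disabled and
//                         only the Reductions bit ends up enabled.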
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
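// Worked example of the inversion above (a sketch, considering only the
// FeatureExecuteOnly bit in InlineInverseFeatures):
//   caller {}, callee {+execute-only}: effective caller bit = 1, effective
//     callee bit = 0, so the subset check passes and inlining is allowed.
//   caller {+execute-only}, callee {}: effective caller bit = 0, effective
//     callee bit = 1, so the subset check fails and inlining is rejected.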
325
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
349
350unsigned
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
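// For illustration, with the default option values above
// (CallPenaltyChangeSM = 5, InlineCallPenaltyChangeSM = 10): a call needing a
// streaming-mode change in case (1) costs 5 * DefaultCallPenalty, in case (2)
// it costs 10 * DefaultCallPenalty, and otherwise the default penalty is
// returned unchanged.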
385
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
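// Rough examples of the costs returned above (assuming the usual MOVZ/MOVK
// expansion produced by expandMOVImm):
//   0                  -> 0 (free)
//   0x00ff00ff00ff00ff -> 0 (encodable as a logical immediate)
//   0x1234             -> 1 (single MOVZ)
//   0x0000123456789abc -> 3 (MOVZ + 2 x MOVK)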
414
415/// Calculate the cost of materializing the given constant.
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
443 const APInt &Imm, Type *Ty,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
508}
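// For example (a sketch of the intent above, not an exhaustive description):
// an i64 'add' whose immediate needs several MOVZ/MOVK instructions returns
// that materialization cost, making the constant a candidate for hoisting,
// whereas an immediate that fits in a single instruction is reported as
// TCC_Free.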
509
512 const APInt &Imm, Type *Ty,
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
560}
561
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
574}
575
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
616}
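// For illustration (assuming the default aarch64-base-histcnt-cost of 8):
// <vscale x 8 x ptr> buckets updating i32 elements legalise to two native
// HISTCNT operations (EC = 8, NaturalVectorWidth = 128 / 32 = 4), so the
// returned cost is 8 * 2 = 16.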
617
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::sadd_sat:
655 case Intrinsic::ssub_sat:
656 case Intrinsic::uadd_sat:
657 case Intrinsic::usub_sat: {
658 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
659 MVT::v8i16, MVT::v2i32, MVT::v4i32,
660 MVT::v2i64};
661 auto LT = getTypeLegalizationCost(RetTy);
662 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
663 // need to extend the type, as it uses shr(qadd(shl, shl)).
664 unsigned Instrs =
665 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
666 if (any_of(ValidSatTys, equal_to(LT.second)))
667 return LT.first * Instrs;
668
670 uint64_t VectorSize = TS.getKnownMinValue();
671
672 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
673 return LT.first * Instrs;
674
675 break;
676 }
677 case Intrinsic::abs: {
678 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
679 MVT::v8i16, MVT::v2i32, MVT::v4i32,
680 MVT::v2i64};
681 auto LT = getTypeLegalizationCost(RetTy);
682 if (any_of(ValidAbsTys, equal_to(LT.second)))
683 return LT.first;
684 break;
685 }
686 case Intrinsic::bswap: {
687 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
688 MVT::v4i32, MVT::v2i64};
689 auto LT = getTypeLegalizationCost(RetTy);
690 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
691 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
692 return LT.first;
693 break;
694 }
695 case Intrinsic::fma:
696 case Intrinsic::fmuladd: {
697 // Given an fma or fmuladd, cost it the same as an fmul instruction, as they
698 // usually have the same cost. TODO: Add fp16 and bf16 expansion costs.
699 Type *EltTy = RetTy->getScalarType();
700 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
701 (EltTy->isHalfTy() && ST->hasFullFP16()))
702 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
703 break;
704 }
705 case Intrinsic::stepvector: {
706 InstructionCost Cost = 1; // Cost of the `index' instruction
707 auto LT = getTypeLegalizationCost(RetTy);
708 // Legalisation of illegal vectors involves an `index' instruction plus
709 // (LT.first - 1) vector adds.
710 if (LT.first > 1) {
711 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
712 InstructionCost AddCost =
713 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
714 Cost += AddCost * (LT.first - 1);
715 }
716 return Cost;
717 }
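// For example, a stepvector returning <vscale x 4 x i64> legalises into two
// parts (LT.first == 2), giving one `index' instruction plus one vector add.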
718 case Intrinsic::vector_extract:
719 case Intrinsic::vector_insert: {
720 // If both the vector and subvector types are legal types and the index
721 // is 0, then this should be a no-op or simple operation; return a
722 // relatively low cost.
723
724 // If arguments aren't actually supplied, then we cannot determine the
725 // value of the index. We also want to skip predicate types.
726 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
728 break;
729
730 LLVMContext &C = RetTy->getContext();
731 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
732 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
733 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
734 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
735 // Skip this if either the vector or subvector types are unpacked
736 // SVE types; they may get lowered to stack stores and loads.
737 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
738 break;
739
741 getTLI()->getTypeConversion(C, SubVecVT);
743 getTLI()->getTypeConversion(C, VecVT);
744 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
745 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
746 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
747 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
748 return TTI::TCC_Free;
749 break;
750 }
751 case Intrinsic::bitreverse: {
752 static const CostTblEntry BitreverseTbl[] = {
753 {Intrinsic::bitreverse, MVT::i32, 1},
754 {Intrinsic::bitreverse, MVT::i64, 1},
755 {Intrinsic::bitreverse, MVT::v8i8, 1},
756 {Intrinsic::bitreverse, MVT::v16i8, 1},
757 {Intrinsic::bitreverse, MVT::v4i16, 2},
758 {Intrinsic::bitreverse, MVT::v8i16, 2},
759 {Intrinsic::bitreverse, MVT::v2i32, 2},
760 {Intrinsic::bitreverse, MVT::v4i32, 2},
761 {Intrinsic::bitreverse, MVT::v1i64, 2},
762 {Intrinsic::bitreverse, MVT::v2i64, 2},
763 };
764 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
765 const auto *Entry =
766 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
767 if (Entry) {
768 // The cost model uses the legal type (i32) that i8 and i16 are promoted to,
769 // plus 1 so that we match the actual lowering cost.
770 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
771 TLI->getValueType(DL, RetTy, true) == MVT::i16)
772 return LegalisationCost.first * Entry->Cost + 1;
773
774 return LegalisationCost.first * Entry->Cost;
775 }
776 break;
777 }
778 case Intrinsic::ctpop: {
779 if (!ST->hasNEON()) {
780 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
781 return getTypeLegalizationCost(RetTy).first * 12;
782 }
783 static const CostTblEntry CtpopCostTbl[] = {
784 {ISD::CTPOP, MVT::v2i64, 4},
785 {ISD::CTPOP, MVT::v4i32, 3},
786 {ISD::CTPOP, MVT::v8i16, 2},
787 {ISD::CTPOP, MVT::v16i8, 1},
788 {ISD::CTPOP, MVT::i64, 4},
789 {ISD::CTPOP, MVT::v2i32, 3},
790 {ISD::CTPOP, MVT::v4i16, 2},
791 {ISD::CTPOP, MVT::v8i8, 1},
792 {ISD::CTPOP, MVT::i32, 5},
793 };
794 auto LT = getTypeLegalizationCost(RetTy);
795 MVT MTy = LT.second;
796 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
797 // Extra cost of +1 when illegal vector types are legalized by promoting
798 // the integer type.
799 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
800 RetTy->getScalarSizeInBits()
801 ? 1
802 : 0;
803 return LT.first * Entry->Cost + ExtraCost;
804 }
805 break;
806 }
807 case Intrinsic::sadd_with_overflow:
808 case Intrinsic::uadd_with_overflow:
809 case Intrinsic::ssub_with_overflow:
810 case Intrinsic::usub_with_overflow:
811 case Intrinsic::smul_with_overflow:
812 case Intrinsic::umul_with_overflow: {
813 static const CostTblEntry WithOverflowCostTbl[] = {
814 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
815 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
816 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
817 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
818 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
819 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
820 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
821 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
822 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
823 {Intrinsic::usub_with_overflow, MVT::i8, 3},
824 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
825 {Intrinsic::usub_with_overflow, MVT::i16, 3},
826 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
827 {Intrinsic::usub_with_overflow, MVT::i32, 1},
828 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
829 {Intrinsic::usub_with_overflow, MVT::i64, 1},
830 {Intrinsic::smul_with_overflow, MVT::i8, 5},
831 {Intrinsic::umul_with_overflow, MVT::i8, 4},
832 {Intrinsic::smul_with_overflow, MVT::i16, 5},
833 {Intrinsic::umul_with_overflow, MVT::i16, 4},
834 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
835 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
836 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
837 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
838 };
839 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
840 if (MTy.isSimple())
841 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
842 MTy.getSimpleVT()))
843 return Entry->Cost;
844 break;
845 }
846 case Intrinsic::fptosi_sat:
847 case Intrinsic::fptoui_sat: {
848 if (ICA.getArgTypes().empty())
849 break;
850 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
851 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
852 EVT MTy = TLI->getValueType(DL, RetTy);
853 // Check for the legal types, which are where the size of the input and the
854 // output are the same, or we are using cvt f64->i32 or f32->i64.
855 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
856 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
857 LT.second == MVT::v2f64)) {
858 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
859 (LT.second == MVT::f64 && MTy == MVT::i32) ||
860 (LT.second == MVT::f32 && MTy == MVT::i64)))
861 return LT.first;
862 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
863 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
864 MTy.getScalarSizeInBits() == 64)
865 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
866 }
867 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
868 // f32.
869 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
870 return LT.first + getIntrinsicInstrCost(
871 {ICA.getID(),
872 RetTy,
873 {ICA.getArgTypes()[0]->getWithNewType(
874 Type::getFloatTy(RetTy->getContext()))}},
875 CostKind);
876 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
877 (LT.second == MVT::f16 && MTy == MVT::i64) ||
878 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
879 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
880 return LT.first;
881 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
882 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
883 MTy.getScalarSizeInBits() == 32)
884 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
885 // Extending vector types v8f16->v8i64. These currently scalarize but the
886 // codegen could be better.
887 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
888 MTy.getScalarSizeInBits() == 64)
889 return MTy.getVectorNumElements() * 3;
890
891 // If we can, use a legal convert followed by a min+max
892 if ((LT.second.getScalarType() == MVT::f32 ||
893 LT.second.getScalarType() == MVT::f64 ||
894 LT.second.getScalarType() == MVT::f16) &&
895 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
896 Type *LegalTy =
897 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
898 if (LT.second.isVector())
899 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
901 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
902 LegalTy, {LegalTy, LegalTy});
904 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
905 LegalTy, {LegalTy, LegalTy});
907 return LT.first * Cost +
908 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
909 : 1);
910 }
911 // Otherwise we need to follow the default expansion that clamps the value
912 // using a float min/max with a fcmp+sel for nan handling when signed.
913 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
914 RetTy = RetTy->getScalarType();
915 if (LT.second.isVector()) {
916 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
917 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
918 }
919 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
921 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
923 Cost +=
924 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
926 if (IsSigned) {
927 Type *CondTy = RetTy->getWithNewBitWidth(1);
928 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
930 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
932 }
933 return LT.first * Cost;
934 }
935 case Intrinsic::fshl:
936 case Intrinsic::fshr: {
937 if (ICA.getArgs().empty())
938 break;
939
940 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
941
942 // ROTR / ROTL is a funnel shift with equal first and second operand. For
943 // ROTR on integer registers (i32/i64) this can be done in a single ror
944 // instruction. A fshl with a non-constant shift uses a neg + ror.
945 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
946 (RetTy->getPrimitiveSizeInBits() == 32 ||
947 RetTy->getPrimitiveSizeInBits() == 64)) {
948 InstructionCost NegCost =
949 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
950 return 1 + NegCost;
951 }
952
953 // TODO: Add handling for fshl where third argument is not a constant.
954 if (!OpInfoZ.isConstant())
955 break;
956
957 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
958 if (OpInfoZ.isUniform()) {
959 static const CostTblEntry FshlTbl[] = {
960 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
961 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
962 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
963 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
964 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
965 // to avoid having to duplicate the costs.
966 const auto *Entry =
967 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
968 if (Entry)
969 return LegalisationCost.first * Entry->Cost;
970 }
971
972 auto TyL = getTypeLegalizationCost(RetTy);
973 if (!RetTy->isIntegerTy())
974 break;
975
976 // Estimate cost manually, as types like i8 and i16 will get promoted to
977 // i32 and CostTableLookup will ignore the extra conversion cost.
978 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
979 RetTy->getScalarSizeInBits() < 64) ||
980 (RetTy->getScalarSizeInBits() % 64 != 0);
981 unsigned ExtraCost = HigherCost ? 1 : 0;
982 if (RetTy->getScalarSizeInBits() == 32 ||
983 RetTy->getScalarSizeInBits() == 64)
984 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
985 // extr instruction.
986 else if (HigherCost)
987 ExtraCost = 1;
988 else
989 break;
990 return TyL.first + ExtraCost;
991 }
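// For example (scalar cases handled above): rotating an i64 by a constant,
// i.e. fshl(%x, %x, 13), is a single ror (cost 1); an fshl rotate by a
// non-constant amount needs neg + ror (cost 2); and a constant funnel shift
// of two different i32/i64 values lowers to a single extr.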
992 case Intrinsic::get_active_lane_mask: {
993 auto RetTy = cast<VectorType>(ICA.getReturnType());
994 EVT RetVT = getTLI()->getValueType(DL, RetTy);
995 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
996 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
997 break;
998
999 if (RetTy->isScalableTy()) {
1000 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1002 break;
1003
1004 auto LT = getTypeLegalizationCost(RetTy);
1005 InstructionCost Cost = LT.first;
1006 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1007 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1008 // nxv32i1 = get_active_lane_mask(base, idx) ->
1009 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1010 if (ST->hasSVE2p1() || ST->hasSME2()) {
1011 Cost /= 2;
1012 if (Cost == 1)
1013 return Cost;
1014 }
1015
1016 // If more than one whilelo intrinsic is required, include the extra cost
1017 // required by the saturating add & select required to increment the
1018 // start value after the first intrinsic call.
1019 Type *OpTy = ICA.getArgTypes()[0];
1020 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1021 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1022 Type *CondTy = OpTy->getWithNewBitWidth(1);
1023 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1025 return Cost + (SplitCost * (Cost - 1));
1026 } else if (!getTLI()->isTypeLegal(RetVT)) {
1027 // We don't have enough context at this point to determine if the mask
1028 // is going to be kept live after the block, which will force the vXi1
1029 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1030 // For now, we just assume the vectorizer created this intrinsic and
1031 // the result will be the input for a PHI. In this case the cost will
1032 // be extremely high for fixed-width vectors.
1033 // NOTE: getScalarizationOverhead returns a cost that's far too
1034 // pessimistic for the actual generated codegen. In reality there are
1035 // two instructions generated per lane.
1036 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1037 }
1038 break;
1039 }
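// For illustration: a <vscale x 32 x i1> mask legalises into two
// <vscale x 16 x i1> parts (LT.first == 2); with SVE2p1 or SME2 this halves
// to a single whilelo_x2 (cost 1), otherwise the second whilelo also pays
// for the uadd_sat + select that advance the start value.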
1040 case Intrinsic::experimental_vector_match: {
1041 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1042 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1043 unsigned SearchSize = NeedleTy->getNumElements();
1044 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1045 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1046 // Neoverse V3, these are cheap operations with the same latency as a
1047 // vector ADD. In most cases, however, we also need to do an extra DUP.
1048 // For fixed-length vectors we currently need an extra five to six
1049 // instructions besides the MATCH.
1051 if (isa<FixedVectorType>(RetTy))
1052 Cost += 10;
1053 return Cost;
1054 }
1055 break;
1056 }
1057 case Intrinsic::experimental_cttz_elts: {
1058 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1059 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1060 // This will consist of a SVE brkb and a cntp instruction. These
1061 // typically have the same latency and half the throughput as a vector
1062 // add instruction.
1063 return 4;
1064 }
1065 break;
1066 }
1067 case Intrinsic::loop_dependence_raw_mask:
1068 case Intrinsic::loop_dependence_war_mask: {
1069 // The whilewr/rw instructions require SVE2 or SME.
1070 if (ST->hasSVE2() || ST->hasSME()) {
1071 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1072 unsigned EltSizeInBytes =
1073 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1074 if (is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) &&
1075 VecVT.getVectorMinNumElements() == (16 / EltSizeInBytes))
1076 return 1;
1077 }
1078 break;
1079 }
1080 case Intrinsic::experimental_vector_extract_last_active:
1081 if (ST->isSVEorStreamingSVEAvailable()) {
1082 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1083 // This should turn into chained clastb instructions.
1084 return LegalCost;
1085 }
1086 break;
1087 default:
1088 break;
1089 }
1091}
1092
1093/// This function removes redundant reinterpret casts in the presence of
1094/// control flow.
1095static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1096 IntrinsicInst &II) {
1098 auto RequiredType = II.getType();
1099
1100 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1101 assert(PN && "Expected Phi Node!");
1102
1103 // Don't create a new Phi unless we can remove the old one.
1104 if (!PN->hasOneUse())
1105 return std::nullopt;
1106
1107 for (Value *IncValPhi : PN->incoming_values()) {
1108 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1109 if (!Reinterpret ||
1110 Reinterpret->getIntrinsicID() !=
1111 Intrinsic::aarch64_sve_convert_to_svbool ||
1112 RequiredType != Reinterpret->getArgOperand(0)->getType())
1113 return std::nullopt;
1114 }
1115
1116 // Create the new Phi
1117 IC.Builder.SetInsertPoint(PN);
1118 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1119 Worklist.push_back(PN);
1120
1121 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1122 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1123 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1124 Worklist.push_back(Reinterpret);
1125 }
1126
1127 // Cleanup Phi Node and reinterprets
1128 return IC.replaceInstUsesWith(II, NPN);
1129}
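// Roughly, this rewrites IR of the form (a sketch; %p1/%p2 are placeholders):
//   %a   = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(%p1)
//   %b   = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(%p2)
//   %phi = phi <vscale x 16 x i1> [ %a, %bb0 ], [ %b, %bb1 ]
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
// into a phi over %p1/%p2 directly, dropping the redundant conversions.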
1130
1131// A collection of properties common to SVE intrinsics that allow for combines
1132// to be written without needing to know the specific intrinsic.
1134 //
1135 // Helper routines for common intrinsic definitions.
1136 //
1137
1138 // e.g. llvm.aarch64.sve.add pg, op1, op2
1139 // with IID ==> llvm.aarch64.sve.add_u
1140 static SVEIntrinsicInfo
1147
1148 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1155
1156 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1162
1163 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1169
1170 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1171 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1172 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1173 return SVEIntrinsicInfo()
1176 }
1177
1178 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1179 // llvm.aarch64.sve.ld1 pg, ptr
1186
1187 // All properties relate to predication and thus having a general predicate
1188 // is the minimum requirement to say there is intrinsic info to act on.
1189 explicit operator bool() const { return hasGoverningPredicate(); }
1190
1191 //
1192 // Properties relating to the governing predicate.
1193 //
1194
1196 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1197 }
1198
1200 assert(hasGoverningPredicate() && "Property not set!");
1201 return GoverningPredicateIdx;
1202 }
1203
1205 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1206 GoverningPredicateIdx = Index;
1207 return *this;
1208 }
1209
1210 //
1211 // Properties relating to operations the intrinsic could be transformed into.
1212 // NOTE: This does not mean such a transformation is always possible, but the
1213 // knowledge makes it possible to reuse existing optimisations without needing
1214 // to embed specific handling for each intrinsic. For example, instruction
1215 // simplification can be used to optimise an intrinsic's active lanes.
1216 //
1217
1219 return UndefIntrinsic != Intrinsic::not_intrinsic;
1220 }
1221
1223 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1224 return UndefIntrinsic;
1225 }
1226
1228 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1229 UndefIntrinsic = IID;
1230 return *this;
1231 }
1232
1233 bool hasMatchingIROpode() const { return IROpcode != 0; }
1234
1235 unsigned getMatchingIROpode() const {
1236 assert(hasMatchingIROpode() && "Property not set!");
1237 return IROpcode;
1238 }
1239
1241 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1242 IROpcode = Opcode;
1243 return *this;
1244 }
1245
1246 //
1247 // Properties relating to the result of inactive lanes.
1248 //
1249
1251 return ResultLanes == InactiveLanesTakenFromOperand;
1252 }
1253
1255 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1256 return OperandIdxForInactiveLanes;
1257 }
1258
1260 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1261 ResultLanes = InactiveLanesTakenFromOperand;
1262 OperandIdxForInactiveLanes = Index;
1263 return *this;
1264 }
1265
1267 return ResultLanes == InactiveLanesAreNotDefined;
1268 }
1269
1271 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1272 ResultLanes = InactiveLanesAreNotDefined;
1273 return *this;
1274 }
1275
1277 return ResultLanes == InactiveLanesAreUnused;
1278 }
1279
1281 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1282 ResultLanes = InactiveLanesAreUnused;
1283 return *this;
1284 }
1285
1286 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1287 // inactiveLanesAreZeroed =
1288 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1289 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1290
1292 ResultIsZeroInitialized = true;
1293 return *this;
1294 }
1295
1296 //
1297 // The first operand of unary merging operations is typically only used to
1298 // set the result for inactive lanes. Knowing this allows us to deadcode the
1299 // operand when we can prove there are no inactive lanes.
1300 //
1301
1303 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1304 }
1305
1307 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1308 return OperandIdxWithNoActiveLanes;
1309 }
1310
1312 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1313 OperandIdxWithNoActiveLanes = Index;
1314 return *this;
1315 }
1316
1317private:
1318 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1319
1320 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1321 unsigned IROpcode = 0;
1322
1323 enum PredicationStyle {
1325 InactiveLanesTakenFromOperand,
1326 InactiveLanesAreNotDefined,
1327 InactiveLanesAreUnused
1328 } ResultLanes = Uninitialized;
1329
1330 bool ResultIsZeroInitialized = false;
1331 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1332 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1333};
1334
1336 // Some SVE intrinsics do not use scalable vector types, but since they are
1337 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1338 if (!isa<ScalableVectorType>(II.getType()) &&
1339 all_of(II.args(), [&](const Value *V) {
1340 return !isa<ScalableVectorType>(V->getType());
1341 }))
1342 return SVEIntrinsicInfo();
1343
1344 Intrinsic::ID IID = II.getIntrinsicID();
1345 switch (IID) {
1346 default:
1347 break;
1348 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1349 case Intrinsic::aarch64_sve_fcvt_f16f32:
1350 case Intrinsic::aarch64_sve_fcvt_f16f64:
1351 case Intrinsic::aarch64_sve_fcvt_f32f16:
1352 case Intrinsic::aarch64_sve_fcvt_f32f64:
1353 case Intrinsic::aarch64_sve_fcvt_f64f16:
1354 case Intrinsic::aarch64_sve_fcvt_f64f32:
1355 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1356 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1357 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1358 case Intrinsic::aarch64_sve_fcvtzs:
1359 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1360 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1361 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1362 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1363 case Intrinsic::aarch64_sve_fcvtzu:
1364 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1365 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1366 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1367 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1368 case Intrinsic::aarch64_sve_scvtf:
1369 case Intrinsic::aarch64_sve_scvtf_f16i32:
1370 case Intrinsic::aarch64_sve_scvtf_f16i64:
1371 case Intrinsic::aarch64_sve_scvtf_f32i64:
1372 case Intrinsic::aarch64_sve_scvtf_f64i32:
1373 case Intrinsic::aarch64_sve_ucvtf:
1374 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1375 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1376 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1377 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1379
1380 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1381 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1382 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1383 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1385
1386 case Intrinsic::aarch64_sve_fabd:
1387 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1388 case Intrinsic::aarch64_sve_fadd:
1389 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1390 .setMatchingIROpcode(Instruction::FAdd);
1391 case Intrinsic::aarch64_sve_fdiv:
1392 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1393 .setMatchingIROpcode(Instruction::FDiv);
1394 case Intrinsic::aarch64_sve_fmax:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1396 case Intrinsic::aarch64_sve_fmaxnm:
1397 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1398 case Intrinsic::aarch64_sve_fmin:
1399 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1400 case Intrinsic::aarch64_sve_fminnm:
1401 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1402 case Intrinsic::aarch64_sve_fmla:
1403 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1404 case Intrinsic::aarch64_sve_fmls:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1406 case Intrinsic::aarch64_sve_fmul:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1408 .setMatchingIROpcode(Instruction::FMul);
1409 case Intrinsic::aarch64_sve_fmulx:
1410 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1411 case Intrinsic::aarch64_sve_fnmla:
1412 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1413 case Intrinsic::aarch64_sve_fnmls:
1414 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1415 case Intrinsic::aarch64_sve_fsub:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1417 .setMatchingIROpcode(Instruction::FSub);
1418 case Intrinsic::aarch64_sve_add:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1420 .setMatchingIROpcode(Instruction::Add);
1421 case Intrinsic::aarch64_sve_mla:
1422 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1423 case Intrinsic::aarch64_sve_mls:
1424 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1425 case Intrinsic::aarch64_sve_mul:
1426 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1427 .setMatchingIROpcode(Instruction::Mul);
1428 case Intrinsic::aarch64_sve_sabd:
1429 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1430 case Intrinsic::aarch64_sve_sdiv:
1431 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1432 .setMatchingIROpcode(Instruction::SDiv);
1433 case Intrinsic::aarch64_sve_smax:
1434 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1435 case Intrinsic::aarch64_sve_smin:
1436 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1437 case Intrinsic::aarch64_sve_smulh:
1438 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1439 case Intrinsic::aarch64_sve_sub:
1440 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1441 .setMatchingIROpcode(Instruction::Sub);
1442 case Intrinsic::aarch64_sve_uabd:
1443 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1444 case Intrinsic::aarch64_sve_udiv:
1445 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1446 .setMatchingIROpcode(Instruction::UDiv);
1447 case Intrinsic::aarch64_sve_umax:
1448 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1449 case Intrinsic::aarch64_sve_umin:
1450 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1451 case Intrinsic::aarch64_sve_umulh:
1452 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1453 case Intrinsic::aarch64_sve_asr:
1454 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1455 .setMatchingIROpcode(Instruction::AShr);
1456 case Intrinsic::aarch64_sve_lsl:
1457 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1458 .setMatchingIROpcode(Instruction::Shl);
1459 case Intrinsic::aarch64_sve_lsr:
1460 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1461 .setMatchingIROpcode(Instruction::LShr);
1462 case Intrinsic::aarch64_sve_and:
1463 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1464 .setMatchingIROpcode(Instruction::And);
1465 case Intrinsic::aarch64_sve_bic:
1466 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1467 case Intrinsic::aarch64_sve_eor:
1468 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1469 .setMatchingIROpcode(Instruction::Xor);
1470 case Intrinsic::aarch64_sve_orr:
1471 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1472 .setMatchingIROpcode(Instruction::Or);
1473 case Intrinsic::aarch64_sve_shsub:
1474 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1475 case Intrinsic::aarch64_sve_shsubr:
1477 case Intrinsic::aarch64_sve_sqrshl:
1478 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1479 case Intrinsic::aarch64_sve_sqshl:
1480 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1481 case Intrinsic::aarch64_sve_sqsub:
1482 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1483 case Intrinsic::aarch64_sve_srshl:
1484 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1485 case Intrinsic::aarch64_sve_uhsub:
1486 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1487 case Intrinsic::aarch64_sve_uhsubr:
1489 case Intrinsic::aarch64_sve_uqrshl:
1490 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1491 case Intrinsic::aarch64_sve_uqshl:
1492 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1493 case Intrinsic::aarch64_sve_uqsub:
1494 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1495 case Intrinsic::aarch64_sve_urshl:
1496 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1497
1498 case Intrinsic::aarch64_sve_add_u:
1500 Instruction::Add);
1501 case Intrinsic::aarch64_sve_and_u:
1503 Instruction::And);
1504 case Intrinsic::aarch64_sve_asr_u:
1506 Instruction::AShr);
1507 case Intrinsic::aarch64_sve_eor_u:
1509 Instruction::Xor);
1510 case Intrinsic::aarch64_sve_fadd_u:
1512 Instruction::FAdd);
1513 case Intrinsic::aarch64_sve_fdiv_u:
1515 Instruction::FDiv);
1516 case Intrinsic::aarch64_sve_fmul_u:
1518 Instruction::FMul);
1519 case Intrinsic::aarch64_sve_fsub_u:
1521 Instruction::FSub);
1522 case Intrinsic::aarch64_sve_lsl_u:
1524 Instruction::Shl);
1525 case Intrinsic::aarch64_sve_lsr_u:
1527 Instruction::LShr);
1528 case Intrinsic::aarch64_sve_mul_u:
1530 Instruction::Mul);
1531 case Intrinsic::aarch64_sve_orr_u:
1533 Instruction::Or);
1534 case Intrinsic::aarch64_sve_sdiv_u:
1536 Instruction::SDiv);
1537 case Intrinsic::aarch64_sve_sub_u:
1539 Instruction::Sub);
1540 case Intrinsic::aarch64_sve_udiv_u:
1542 Instruction::UDiv);
1543
1544 case Intrinsic::aarch64_sve_addqv:
1545 case Intrinsic::aarch64_sve_and_z:
1546 case Intrinsic::aarch64_sve_bic_z:
1547 case Intrinsic::aarch64_sve_brka_z:
1548 case Intrinsic::aarch64_sve_brkb_z:
1549 case Intrinsic::aarch64_sve_brkn_z:
1550 case Intrinsic::aarch64_sve_brkpa_z:
1551 case Intrinsic::aarch64_sve_brkpb_z:
1552 case Intrinsic::aarch64_sve_cntp:
1553 case Intrinsic::aarch64_sve_compact:
1554 case Intrinsic::aarch64_sve_eor_z:
1555 case Intrinsic::aarch64_sve_eorv:
1556 case Intrinsic::aarch64_sve_eorqv:
1557 case Intrinsic::aarch64_sve_nand_z:
1558 case Intrinsic::aarch64_sve_nor_z:
1559 case Intrinsic::aarch64_sve_orn_z:
1560 case Intrinsic::aarch64_sve_orr_z:
1561 case Intrinsic::aarch64_sve_orv:
1562 case Intrinsic::aarch64_sve_orqv:
1563 case Intrinsic::aarch64_sve_pnext:
1564 case Intrinsic::aarch64_sve_rdffr_z:
1565 case Intrinsic::aarch64_sve_saddv:
1566 case Intrinsic::aarch64_sve_uaddv:
1567 case Intrinsic::aarch64_sve_umaxv:
1568 case Intrinsic::aarch64_sve_umaxqv:
1569 case Intrinsic::aarch64_sve_cmpeq:
1570 case Intrinsic::aarch64_sve_cmpeq_wide:
1571 case Intrinsic::aarch64_sve_cmpge:
1572 case Intrinsic::aarch64_sve_cmpge_wide:
1573 case Intrinsic::aarch64_sve_cmpgt:
1574 case Intrinsic::aarch64_sve_cmpgt_wide:
1575 case Intrinsic::aarch64_sve_cmphi:
1576 case Intrinsic::aarch64_sve_cmphi_wide:
1577 case Intrinsic::aarch64_sve_cmphs:
1578 case Intrinsic::aarch64_sve_cmphs_wide:
1579 case Intrinsic::aarch64_sve_cmple_wide:
1580 case Intrinsic::aarch64_sve_cmplo_wide:
1581 case Intrinsic::aarch64_sve_cmpls_wide:
1582 case Intrinsic::aarch64_sve_cmplt_wide:
1583 case Intrinsic::aarch64_sve_cmpne:
1584 case Intrinsic::aarch64_sve_cmpne_wide:
1585 case Intrinsic::aarch64_sve_facge:
1586 case Intrinsic::aarch64_sve_facgt:
1587 case Intrinsic::aarch64_sve_fcmpeq:
1588 case Intrinsic::aarch64_sve_fcmpge:
1589 case Intrinsic::aarch64_sve_fcmpgt:
1590 case Intrinsic::aarch64_sve_fcmpne:
1591 case Intrinsic::aarch64_sve_fcmpuo:
1592 case Intrinsic::aarch64_sve_ld1:
1593 case Intrinsic::aarch64_sve_ld1_gather:
1594 case Intrinsic::aarch64_sve_ld1_gather_index:
1595 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1596 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1597 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1598 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1599 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1600 case Intrinsic::aarch64_sve_ld1q_gather_index:
1601 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1602 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1603 case Intrinsic::aarch64_sve_ld1ro:
1604 case Intrinsic::aarch64_sve_ld1rq:
1605 case Intrinsic::aarch64_sve_ld1udq:
1606 case Intrinsic::aarch64_sve_ld1uwq:
1607 case Intrinsic::aarch64_sve_ld2_sret:
1608 case Intrinsic::aarch64_sve_ld2q_sret:
1609 case Intrinsic::aarch64_sve_ld3_sret:
1610 case Intrinsic::aarch64_sve_ld3q_sret:
1611 case Intrinsic::aarch64_sve_ld4_sret:
1612 case Intrinsic::aarch64_sve_ld4q_sret:
1613 case Intrinsic::aarch64_sve_ldff1:
1614 case Intrinsic::aarch64_sve_ldff1_gather:
1615 case Intrinsic::aarch64_sve_ldff1_gather_index:
1616 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1617 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1618 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1619 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1620 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1621 case Intrinsic::aarch64_sve_ldnf1:
1622 case Intrinsic::aarch64_sve_ldnt1:
1623 case Intrinsic::aarch64_sve_ldnt1_gather:
1624 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1625 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1626 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1628
1629 case Intrinsic::aarch64_sve_prf:
1630 case Intrinsic::aarch64_sve_prfb_gather_index:
1631 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1632 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1633 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1634 case Intrinsic::aarch64_sve_prfd_gather_index:
1635 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1636 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1637 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1638 case Intrinsic::aarch64_sve_prfh_gather_index:
1639 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1640 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1641 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1642 case Intrinsic::aarch64_sve_prfw_gather_index:
1643 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1644 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1645 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1647
1648 case Intrinsic::aarch64_sve_st1_scatter:
1649 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1650 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1651 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1652 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1653 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1654 case Intrinsic::aarch64_sve_st1dq:
1655 case Intrinsic::aarch64_sve_st1q_scatter_index:
1656 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1657 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1658 case Intrinsic::aarch64_sve_st1wq:
1659 case Intrinsic::aarch64_sve_stnt1:
1660 case Intrinsic::aarch64_sve_stnt1_scatter:
1661 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1662 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1663 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1665 case Intrinsic::aarch64_sve_st2:
1666 case Intrinsic::aarch64_sve_st2q:
1668 case Intrinsic::aarch64_sve_st3:
1669 case Intrinsic::aarch64_sve_st3q:
1671 case Intrinsic::aarch64_sve_st4:
1672 case Intrinsic::aarch64_sve_st4q:
1674 }
1675
1676 return SVEIntrinsicInfo();
1677}
1678
1679static bool isAllActivePredicate(Value *Pred) {
1680 Value *UncastedPred;
1681
1682 // Look through predicate casts that only remove lanes.
1684 m_Value(UncastedPred)))) {
1685 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1686 Pred = UncastedPred;
1687
1689 m_Value(UncastedPred))))
1690 // If the predicate has the same number of lanes as the uncasted predicate, or
1691 // fewer, then we know the casting has no effect.
1692 if (OrigPredTy->getMinNumElements() <=
1693 cast<ScalableVectorType>(UncastedPred->getType())
1694 ->getMinNumElements())
1695 Pred = UncastedPred;
1696 }
1697
1698 auto *C = dyn_cast<Constant>(Pred);
1699 return C && C->isAllOnesValue();
1700}
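// Illustrative example (added for exposition; value names and types are
// hypothetical): both predicates below are treated as all active, the second
// because the convert.to/from.svbool round trip is looked through and the
// narrowed result has no more lanes than the original constant:
//   %pg1 = <vscale x 4 x i1> splat (i1 true)
//   %sv  = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %pg1)
//   %pg2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %sv)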
1701
1702// Simplify `V` by only considering the operations that affect active lanes.
1703// This function should only return existing Values or newly created Constants.
1704static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1705 auto *Dup = dyn_cast<IntrinsicInst>(V);
1706 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1707 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1709 cast<VectorType>(V->getType())->getElementCount(),
1710 cast<Constant>(Dup->getOperand(2)));
1711
1712 return V;
1713}
1714
1715static std::optional<Instruction *>
1717 const SVEIntrinsicInfo &IInfo) {
1718 const unsigned Opc = IInfo.getMatchingIROpode();
1719 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1720
1721 Value *Pg = II.getOperand(0);
1722 Value *Op1 = II.getOperand(1);
1723 Value *Op2 = II.getOperand(2);
1724 const DataLayout &DL = II.getDataLayout();
1725
1726 // Canonicalise constants to the RHS.
1728 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1729 IC.replaceOperand(II, 1, Op2);
1730 IC.replaceOperand(II, 2, Op1);
1731 return &II;
1732 }
1733
1734 // Only active lanes matter when simplifying the operation.
1735 Op1 = stripInactiveLanes(Op1, Pg);
1736 Op2 = stripInactiveLanes(Op2, Pg);
1737
1738 Value *SimpleII;
1739 if (auto FII = dyn_cast<FPMathOperator>(&II))
1740 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1741 else
1742 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1743
1744 // An SVE intrinsic's result is always defined. However, this is not the case
1745 // for its equivalent IR instruction (e.g. when shifting by an amount more
1746 // than the data's bitwidth). Simplifications to an undefined result must be
1747 // ignored to preserve the intrinsic's expected behaviour.
1748 if (!SimpleII || isa<UndefValue>(SimpleII))
1749 return std::nullopt;
1750
1751 if (IInfo.inactiveLanesAreNotDefined())
1752 return IC.replaceInstUsesWith(II, SimpleII);
1753
1754 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1755
1756 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1757 if (SimpleII == Inactive)
1758 return IC.replaceInstUsesWith(II, SimpleII);
1759
1760 // Inactive lanes must be preserved.
1761 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1762 return IC.replaceInstUsesWith(II, SimpleII);
1763}
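// Illustrative examples of the folds above (value names are hypothetical),
// using the pseudo notation from the surrounding comments:
//   sve.mul.u(pg, A, splat(1))  -->  A                        ; inactive lanes are undefined anyway
//   sve.add(pg, A, splat(0))    -->  A                        ; simplified value equals the inactive source
//   sve.sub(pg, A, A)           -->  select(pg, splat(0), A)  ; inactive lanes must be preserved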
1764
1765// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1766// to operations with less strict inactive lane requirements.
1767static std::optional<Instruction *>
1769 const SVEIntrinsicInfo &IInfo) {
1770 if (!IInfo.hasGoverningPredicate())
1771 return std::nullopt;
1772
1773 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1774
1775 // If there are no active lanes.
1776 if (match(OpPredicate, m_ZeroInt())) {
1778 return IC.replaceInstUsesWith(
1779 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1780
1781 if (IInfo.inactiveLanesAreUnused()) {
1782 if (IInfo.resultIsZeroInitialized())
1784
1785 return IC.eraseInstFromFunction(II);
1786 }
1787 }
1788
1789 // If there are no inactive lanes.
1790 if (isAllActivePredicate(OpPredicate)) {
1791 if (IInfo.hasOperandWithNoActiveLanes()) {
1792 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1793 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1794 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1795 }
1796
1797 if (IInfo.hasMatchingUndefIntrinsic()) {
1798 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1799 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1800 II.setCalledFunction(NewDecl);
1801 return &II;
1802 }
1803 }
1804
1805 // Operation specific simplifications.
1806 if (IInfo.hasMatchingIROpode() &&
1807 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1808 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1809
1810 return std::nullopt;
1811}
1812
1813 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1814// => (binop (pred) (from_svbool _) (from_svbool _))
1815//
1816// The above transformation eliminates a `to_svbool` in the predicate
1817// operand of bitwise operation `binop` by narrowing the vector width of
1818// the operation. For example, it would convert a `<vscale x 16 x i1>
1819// and` into a `<vscale x 4 x i1> and`. This is profitable because
1820// to_svbool must zero the new lanes during widening, whereas
1821// from_svbool is free.
1822static std::optional<Instruction *>
1824 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1825 if (!BinOp)
1826 return std::nullopt;
1827
1828 auto IntrinsicID = BinOp->getIntrinsicID();
1829 switch (IntrinsicID) {
1830 case Intrinsic::aarch64_sve_and_z:
1831 case Intrinsic::aarch64_sve_bic_z:
1832 case Intrinsic::aarch64_sve_eor_z:
1833 case Intrinsic::aarch64_sve_nand_z:
1834 case Intrinsic::aarch64_sve_nor_z:
1835 case Intrinsic::aarch64_sve_orn_z:
1836 case Intrinsic::aarch64_sve_orr_z:
1837 break;
1838 default:
1839 return std::nullopt;
1840 }
1841
1842 auto BinOpPred = BinOp->getOperand(0);
1843 auto BinOpOp1 = BinOp->getOperand(1);
1844 auto BinOpOp2 = BinOp->getOperand(2);
1845
1846 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1847 if (!PredIntr ||
1848 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1849 return std::nullopt;
1850
1851 auto PredOp = PredIntr->getOperand(0);
1852 auto PredOpTy = cast<VectorType>(PredOp->getType());
1853 if (PredOpTy != II.getType())
1854 return std::nullopt;
1855
1856 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1857 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1858 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1859 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1860 if (BinOpOp1 == BinOpOp2)
1861 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1862 else
1863 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1864 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1865
1866 auto NarrowedBinOp =
1867 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1868 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1869}
1870
1871static std::optional<Instruction *>
1873 // If the reinterpret instruction operand is a PHI node, combine it via
1874 // processPhiNode.
1874 if (isa<PHINode>(II.getArgOperand(0)))
1875 return processPhiNode(IC, II);
1876
1877 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1878 return BinOpCombine;
1879
1880 // Ignore converts to/from svcount_t.
1881 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1882 isa<TargetExtType>(II.getType()))
1883 return std::nullopt;
1884
1885 SmallVector<Instruction *, 32> CandidatesForRemoval;
1886 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1887
1888 const auto *IVTy = cast<VectorType>(II.getType());
1889
1890 // Walk the chain of conversions.
1891 while (Cursor) {
1892 // If the type of the cursor has fewer lanes than the final result, zeroing
1893 // must take place, which breaks the equivalence chain.
1894 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1895 if (CursorVTy->getElementCount().getKnownMinValue() <
1896 IVTy->getElementCount().getKnownMinValue())
1897 break;
1898
1899 // If the cursor has the same type as I, it is a viable replacement.
1900 if (Cursor->getType() == IVTy)
1901 EarliestReplacement = Cursor;
1902
1903 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1904
1905 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1906 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1907 Intrinsic::aarch64_sve_convert_to_svbool ||
1908 IntrinsicCursor->getIntrinsicID() ==
1909 Intrinsic::aarch64_sve_convert_from_svbool))
1910 break;
1911
1912 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1913 Cursor = IntrinsicCursor->getOperand(0);
1914 }
1915
1916 // If no viable replacement in the conversion chain was found, there is
1917 // nothing to do.
1918 if (!EarliestReplacement)
1919 return std::nullopt;
1920
1921 return IC.replaceInstUsesWith(II, EarliestReplacement);
1922}
1923
1924static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1925 IntrinsicInst &II) {
1926 // svsel(ptrue, x, y) => x
1927 auto *OpPredicate = II.getOperand(0);
1928 if (isAllActivePredicate(OpPredicate))
1929 return IC.replaceInstUsesWith(II, II.getOperand(1));
1930
1931 auto Select =
1932 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1933 return IC.replaceInstUsesWith(II, Select);
1934}
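// Illustrative example (value names and element types are hypothetical) of the
// generic lowering above when the predicate is not known to be all active:
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.sel.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
//     --> %r = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y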
1935
1936static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1937 IntrinsicInst &II) {
1938 Value *Pg = II.getOperand(1);
1939
1940 // sve.dup(V, all_active, X) ==> splat(X)
1941 if (isAllActivePredicate(Pg)) {
1942 auto *RetTy = cast<ScalableVectorType>(II.getType());
1943 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1944 II.getArgOperand(2));
1945 return IC.replaceInstUsesWith(II, Splat);
1946 }
1947
1948 if (!match(Pg, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1949 m_SpecificInt(AArch64SVEPredPattern::vl1))))
1950 return std::nullopt;
1951
1952 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1953 Value *Insert = IC.Builder.CreateInsertElement(
1954 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
1955 return IC.replaceInstUsesWith(II, Insert);
1956}
1957
1958static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1959 IntrinsicInst &II) {
1960 // Replace DupX with a regular IR splat.
1961 auto *RetTy = cast<ScalableVectorType>(II.getType());
1962 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1963 II.getArgOperand(0));
1964 Splat->takeName(&II);
1965 return IC.replaceInstUsesWith(II, Splat);
1966}
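// Illustrative example (types are hypothetical): the splat created above is
// emitted as the canonical insertelement + shufflevector idiom, e.g.
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %x)
//     --> %ins = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
//         %r   = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer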
1967
1968static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1969 IntrinsicInst &II) {
1970 LLVMContext &Ctx = II.getContext();
1971
1972 if (!isAllActivePredicate(II.getArgOperand(0)))
1973 return std::nullopt;
1974
1975 // Check that we have a compare of zero..
1976 auto *SplatValue =
1978 if (!SplatValue || !SplatValue->isZero())
1979 return std::nullopt;
1980
1981 // ..against a dupq
1982 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1983 if (!DupQLane ||
1984 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1985 return std::nullopt;
1986
1987 // Where the dupq is a lane 0 replicate of a vector insert
1988 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1989 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1990 return std::nullopt;
1991
1992 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1993 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1994 return std::nullopt;
1995
1996 // Where the vector insert is a fixed constant vector insert into undef at
1997 // index zero
1998 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1999 return std::nullopt;
2000
2001 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2002 return std::nullopt;
2003
2004 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2005 if (!ConstVec)
2006 return std::nullopt;
2007
2008 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2009 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2010 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2011 return std::nullopt;
2012
2013 unsigned NumElts = VecTy->getNumElements();
2014 unsigned PredicateBits = 0;
2015
2016 // Expand the intrinsic operands to a 16-bit, byte-level predicate.
2017 for (unsigned I = 0; I < NumElts; ++I) {
2018 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2019 if (!Arg)
2020 return std::nullopt;
2021 if (!Arg->isZero())
2022 PredicateBits |= 1 << (I * (16 / NumElts));
2023 }
2024
2025 // If all bits are zero bail early with an empty predicate
2026 if (PredicateBits == 0) {
2027 auto *PFalse = Constant::getNullValue(II.getType());
2028 PFalse->takeName(&II);
2029 return IC.replaceInstUsesWith(II, PFalse);
2030 }
2031
2032 // Calculate largest predicate type used (where byte predicate is largest)
2033 unsigned Mask = 8;
2034 for (unsigned I = 0; I < 16; ++I)
2035 if ((PredicateBits & (1 << I)) != 0)
2036 Mask |= (I % 8);
2037
2038 unsigned PredSize = Mask & -Mask;
2039 auto *PredType = ScalableVectorType::get(
2040 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2041
2042 // Ensure all relevant bits are set
2043 for (unsigned I = 0; I < 16; I += PredSize)
2044 if ((PredicateBits & (1 << I)) == 0)
2045 return std::nullopt;
2046
2047 auto *PTruePat =
2048 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2049 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2050 {PredType}, {PTruePat});
2051 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2052 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2053 auto *ConvertFromSVBool =
2054 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2055 {II.getType()}, {ConvertToSVBool});
2056
2057 ConvertFromSVBool->takeName(&II);
2058 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2059}
2060
2061static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2062 IntrinsicInst &II) {
2063 Value *Pg = II.getArgOperand(0);
2064 Value *Vec = II.getArgOperand(1);
2065 auto IntrinsicID = II.getIntrinsicID();
2066 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2067
2068 // lastX(splat(X)) --> X
2069 if (auto *SplatVal = getSplatValue(Vec))
2070 return IC.replaceInstUsesWith(II, SplatVal);
2071
2072 // If x and/or y is a splat value then:
2073 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2074 Value *LHS, *RHS;
2075 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2076 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2077 auto *OldBinOp = cast<BinaryOperator>(Vec);
2078 auto OpC = OldBinOp->getOpcode();
2079 auto *NewLHS =
2080 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2081 auto *NewRHS =
2082 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2083 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2084 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2085 return IC.replaceInstUsesWith(II, NewBinOp);
2086 }
2087 }
2088
2089 auto *C = dyn_cast<Constant>(Pg);
2090 if (IsAfter && C && C->isNullValue()) {
2091 // The intrinsic is extracting lane 0 so use an extract instead.
2092 auto *IdxTy = Type::getInt64Ty(II.getContext());
2093 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2094 Extract->insertBefore(II.getIterator());
2095 Extract->takeName(&II);
2096 return IC.replaceInstUsesWith(II, Extract);
2097 }
2098
2099 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2100 if (!IntrPG)
2101 return std::nullopt;
2102
2103 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2104 return std::nullopt;
2105
2106 const auto PTruePattern =
2107 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2108
2109 // Can the intrinsic's predicate be converted to a known constant index?
2110 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2111 if (!MinNumElts)
2112 return std::nullopt;
2113
2114 unsigned Idx = MinNumElts - 1;
2115 // Increment the index if extracting the element after the last active
2116 // predicate element.
2117 if (IsAfter)
2118 ++Idx;
2119
2120 // Ignore extracts whose index is larger than the known minimum vector
2121 // length. NOTE: This is an artificial constraint where we prefer to
2122 // maintain what the user asked for until an alternative is proven faster.
2123 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2124 if (Idx >= PgVTy->getMinNumElements())
2125 return std::nullopt;
2126
2127 // The intrinsic is extracting a fixed lane so use an extract instead.
2128 auto *IdxTy = Type::getInt64Ty(II.getContext());
2129 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2130 Extract->insertBefore(II.getIterator());
2131 Extract->takeName(&II);
2132 return IC.replaceInstUsesWith(II, Extract);
2133}
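// Illustrative examples of the constant-index case above (pseudo notation,
// value names hypothetical):
//   sve.lastb(ptrue(vl4), %v)  -->  extractelement %v, i64 3
//   sve.lasta(ptrue(vl2), %v)  -->  extractelement %v, i64 2
// Both rewrites fire only because the index stays below the predicate type's
// known minimum element count.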
2134
2135static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2138 // integer variant across a variety of micro-architectures. Replace scalar
2139 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2140 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2141 // depending on the micro-architecture, but has been observed as generally
2142 // being faster, particularly when the CLAST[AB] op is a loop-carried
2143 // dependency.
2144 Value *Pg = II.getArgOperand(0);
2145 Value *Fallback = II.getArgOperand(1);
2146 Value *Vec = II.getArgOperand(2);
2147 Type *Ty = II.getType();
2148
2149 if (!Ty->isIntegerTy())
2150 return std::nullopt;
2151
2152 Type *FPTy;
2153 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2154 default:
2155 return std::nullopt;
2156 case 16:
2157 FPTy = IC.Builder.getHalfTy();
2158 break;
2159 case 32:
2160 FPTy = IC.Builder.getFloatTy();
2161 break;
2162 case 64:
2163 FPTy = IC.Builder.getDoubleTy();
2164 break;
2165 }
2166
2167 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2168 auto *FPVTy = VectorType::get(
2169 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2170 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2171 auto *FPII = IC.Builder.CreateIntrinsic(
2172 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2173 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2174 return IC.replaceInstUsesWith(II, FPIItoInt);
2175}
2176
2177static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2178 IntrinsicInst &II) {
2179 LLVMContext &Ctx = II.getContext();
2180 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2181 // can work with RDFFR_PP for ptest elimination.
2182 auto *AllPat =
2183 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2184 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2185 {II.getType()}, {AllPat});
2186 auto *RDFFR =
2187 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2188 RDFFR->takeName(&II);
2189 return IC.replaceInstUsesWith(II, RDFFR);
2190}
2191
2192static std::optional<Instruction *>
2193 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2194 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2195
2196 if (Pattern == AArch64SVEPredPattern::all) {
2197 Value *Cnt = IC.Builder.CreateElementCount(
2198 II.getType(), ElementCount::getScalable(NumElts));
2199 Cnt->takeName(&II);
2200 return IC.replaceInstUsesWith(II, Cnt);
2201 }
2202
2203 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2204
2205 return MinNumElts && NumElts >= MinNumElts
2206 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2207 II, ConstantInt::get(II.getType(), MinNumElts)))
2208 : std::nullopt;
2209}
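// Illustrative examples for cntw, i.e. NumElts == 4 (pseudo notation):
//   sve.cntw(all)  -->  vscale * 4
//   sve.cntw(vl4)  -->  4            ; vl4 always fits within the 128-bit minimum
//   sve.cntw(vl8)  -->  unchanged    ; 8 elements may exceed the minimum vector length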
2210
2211static std::optional<Instruction *>
2213 const AArch64Subtarget *ST) {
2214 if (!ST->isStreaming())
2215 return std::nullopt;
2216
2217 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2218 // with SVEPredPattern::all.
2219 Value *Cnt =
2220 IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2));
2221 Cnt->takeName(&II);
2222 return IC.replaceInstUsesWith(II, Cnt);
2223}
2224
2225static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2226 IntrinsicInst &II) {
2227 Value *PgVal = II.getArgOperand(0);
2228 Value *OpVal = II.getArgOperand(1);
2229
2230 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2231 // Later optimizations prefer this form.
2232 if (PgVal == OpVal &&
2233 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2234 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2235 Value *Ops[] = {PgVal, OpVal};
2236 Type *Tys[] = {PgVal->getType()};
2237
2238 auto *PTest =
2239 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2240 PTest->takeName(&II);
2241
2242 return IC.replaceInstUsesWith(II, PTest);
2243 }
2244
2245 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2246 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2247
2248 if (!Pg || !Op)
2249 return std::nullopt;
2250
2251 Intrinsic::ID OpIID = Op->getIntrinsicID();
2252
2253 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2254 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2255 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2256 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2257 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2258
2259 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2260
2261 PTest->takeName(&II);
2262 return IC.replaceInstUsesWith(II, PTest);
2263 }
2264
2265 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2266 // Later optimizations may rewrite sequence to use the flag-setting variant
2267 // of instruction X to remove PTEST.
2268 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2269 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2270 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2271 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2272 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2273 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2274 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2275 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2276 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2277 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2278 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2279 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2280 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2281 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2282 Type *Tys[] = {Pg->getType()};
2283
2284 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2285 PTest->takeName(&II);
2286
2287 return IC.replaceInstUsesWith(II, PTest);
2288 }
2289
2290 return std::nullopt;
2291}
2292
2293template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2294static std::optional<Instruction *>
2296 bool MergeIntoAddendOp) {
2297 Value *P = II.getOperand(0);
2298 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2299 if (MergeIntoAddendOp) {
2300 AddendOp = II.getOperand(1);
2301 Mul = II.getOperand(2);
2302 } else {
2303 AddendOp = II.getOperand(2);
2304 Mul = II.getOperand(1);
2305 }
2306
2308 m_Value(MulOp1))))
2309 return std::nullopt;
2310
2311 if (!Mul->hasOneUse())
2312 return std::nullopt;
2313
2314 Instruction *FMFSource = nullptr;
2315 if (II.getType()->isFPOrFPVectorTy()) {
2316 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2317 // Stop the combine when the flags on the inputs differ in case dropping
2318 // flags would lead to us missing out on more beneficial optimizations.
2319 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2320 return std::nullopt;
2321 if (!FAddFlags.allowContract())
2322 return std::nullopt;
2323 FMFSource = &II;
2324 }
2325
2326 CallInst *Res;
2327 if (MergeIntoAddendOp)
2328 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2329 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2330 else
2331 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2332 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2333
2334 return IC.replaceInstUsesWith(II, Res);
2335}
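// Illustrative examples of the fusions driven by this helper (pseudo notation,
// value names hypothetical); the multiply must share the governing predicate
// and have a single use:
//   fadd(pg, %acc, fmul(pg, %a, %b))  -->  fmla(pg, %acc, %a, %b)  ; MergeIntoAddendOp == true
//   fadd(pg, fmul(pg, %a, %b), %acc)  -->  fmad(pg, %a, %b, %acc)  ; MergeIntoAddendOp == false
// For floating-point types the fast-math flags must also match and permit
// contraction.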
2336
2337static std::optional<Instruction *>
2339 Value *Pred = II.getOperand(0);
2340 Value *PtrOp = II.getOperand(1);
2341 Type *VecTy = II.getType();
2342
2343 if (isAllActivePredicate(Pred)) {
2344 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2345 Load->copyMetadata(II);
2346 return IC.replaceInstUsesWith(II, Load);
2347 }
2348
2349 CallInst *MaskedLoad =
2350 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2351 Pred, ConstantAggregateZero::get(VecTy));
2352 MaskedLoad->copyMetadata(II);
2353 return IC.replaceInstUsesWith(II, MaskedLoad);
2354}
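// Illustrative examples (pseudo notation, value names hypothetical):
//   sve.ld1(all_active, %ptr)  -->  load <vscale x 4 x i32>, ptr %ptr
//   sve.ld1(%pg, %ptr)         -->  masked.load(%ptr, align, %pg, zeroinitializer)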
2355
2356static std::optional<Instruction *>
2358 Value *VecOp = II.getOperand(0);
2359 Value *Pred = II.getOperand(1);
2360 Value *PtrOp = II.getOperand(2);
2361
2362 if (isAllActivePredicate(Pred)) {
2363 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2364 Store->copyMetadata(II);
2365 return IC.eraseInstFromFunction(II);
2366 }
2367
2368 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2369 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2370 MaskedStore->copyMetadata(II);
2371 return IC.eraseInstFromFunction(II);
2372}
2373
2374 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2375 switch (Intrinsic) {
2376 case Intrinsic::aarch64_sve_fmul_u:
2377 return Instruction::BinaryOps::FMul;
2378 case Intrinsic::aarch64_sve_fadd_u:
2379 return Instruction::BinaryOps::FAdd;
2380 case Intrinsic::aarch64_sve_fsub_u:
2381 return Instruction::BinaryOps::FSub;
2382 default:
2383 return Instruction::BinaryOpsEnd;
2384 }
2385}
2386
2387static std::optional<Instruction *>
2389 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2390 if (II.isStrictFP())
2391 return std::nullopt;
2392
2393 auto *OpPredicate = II.getOperand(0);
2394 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2395 if (BinOpCode == Instruction::BinaryOpsEnd ||
2396 !isAllActivePredicate(OpPredicate))
2397 return std::nullopt;
2398 auto BinOp = IC.Builder.CreateBinOpFMF(
2399 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2400 return IC.replaceInstUsesWith(II, BinOp);
2401}
2402
2403static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2404 IntrinsicInst &II) {
2405 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2406 Intrinsic::aarch64_sve_mla>(
2407 IC, II, true))
2408 return MLA;
2409 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2410 Intrinsic::aarch64_sve_mad>(
2411 IC, II, false))
2412 return MAD;
2413 return std::nullopt;
2414}
2415
2416static std::optional<Instruction *>
2418 if (auto FMLA =
2419 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2420 Intrinsic::aarch64_sve_fmla>(IC, II,
2421 true))
2422 return FMLA;
2423 if (auto FMAD =
2424 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2425 Intrinsic::aarch64_sve_fmad>(IC, II,
2426 false))
2427 return FMAD;
2428 if (auto FMLA =
2429 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2430 Intrinsic::aarch64_sve_fmla>(IC, II,
2431 true))
2432 return FMLA;
2433 return std::nullopt;
2434}
2435
2436static std::optional<Instruction *>
2438 if (auto FMLA =
2439 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2440 Intrinsic::aarch64_sve_fmla>(IC, II,
2441 true))
2442 return FMLA;
2443 if (auto FMAD =
2444 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2445 Intrinsic::aarch64_sve_fmad>(IC, II,
2446 false))
2447 return FMAD;
2448 if (auto FMLA_U =
2449 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2450 Intrinsic::aarch64_sve_fmla_u>(
2451 IC, II, true))
2452 return FMLA_U;
2453 return instCombineSVEVectorBinOp(IC, II);
2454}
2455
2456static std::optional<Instruction *>
2458 if (auto FMLS =
2459 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2460 Intrinsic::aarch64_sve_fmls>(IC, II,
2461 true))
2462 return FMLS;
2463 if (auto FMSB =
2464 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2465 Intrinsic::aarch64_sve_fnmsb>(
2466 IC, II, false))
2467 return FMSB;
2468 if (auto FMLS =
2469 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2470 Intrinsic::aarch64_sve_fmls>(IC, II,
2471 true))
2472 return FMLS;
2473 return std::nullopt;
2474}
2475
2476static std::optional<Instruction *>
2478 if (auto FMLS =
2479 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2480 Intrinsic::aarch64_sve_fmls>(IC, II,
2481 true))
2482 return FMLS;
2483 if (auto FMSB =
2484 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2485 Intrinsic::aarch64_sve_fnmsb>(
2486 IC, II, false))
2487 return FMSB;
2488 if (auto FMLS_U =
2489 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2490 Intrinsic::aarch64_sve_fmls_u>(
2491 IC, II, true))
2492 return FMLS_U;
2493 return instCombineSVEVectorBinOp(IC, II);
2494}
2495
2496static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2497 IntrinsicInst &II) {
2498 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2499 Intrinsic::aarch64_sve_mls>(
2500 IC, II, true))
2501 return MLS;
2502 return std::nullopt;
2503}
2504
2505static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2506 IntrinsicInst &II) {
2507 Value *UnpackArg = II.getArgOperand(0);
2508 auto *RetTy = cast<ScalableVectorType>(II.getType());
2509 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2510 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2511
2512 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2513 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2514 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2515 ScalarArg =
2516 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2517 Value *NewVal =
2518 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2519 NewVal->takeName(&II);
2520 return IC.replaceInstUsesWith(II, NewVal);
2521 }
2522
2523 return std::nullopt;
2524}
2525static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2526 IntrinsicInst &II) {
2527 auto *OpVal = II.getOperand(0);
2528 auto *OpIndices = II.getOperand(1);
2529 VectorType *VTy = cast<VectorType>(II.getType());
2530
2531 // Check whether OpIndices is a constant splat value less than the minimum
2532 // element count of the result.
2533 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2534 if (!SplatValue ||
2535 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2536 return std::nullopt;
2537
2538 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2539 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2540 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2541 auto *VectorSplat =
2542 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2543
2544 VectorSplat->takeName(&II);
2545 return IC.replaceInstUsesWith(II, VectorSplat);
2546}
2547
2548static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2549 IntrinsicInst &II) {
2550 Value *A, *B;
2551 Type *RetTy = II.getType();
2552 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2553 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2554
2555 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2556 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2557 if ((match(II.getArgOperand(0),
2559 match(II.getArgOperand(1),
2561 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2562 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2563 auto *TyA = cast<ScalableVectorType>(A->getType());
2564 if (TyA == B->getType() &&
2566 auto *SubVec = IC.Builder.CreateInsertVector(
2567 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2568 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2569 TyA->getMinNumElements());
2570 ConcatVec->takeName(&II);
2571 return IC.replaceInstUsesWith(II, ConcatVec);
2572 }
2573 }
2574
2575 return std::nullopt;
2576}
2577
2578static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2579 IntrinsicInst &II) {
2580 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2581 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2582 Value *A, *B;
2583 if (match(II.getArgOperand(0),
2586 m_Specific(A), m_Specific(B))))
2587 return IC.replaceInstUsesWith(
2588 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2589
2590 return std::nullopt;
2591}
2592
2593static std::optional<Instruction *>
2595 Value *Mask = II.getOperand(0);
2596 Value *BasePtr = II.getOperand(1);
2597 Value *Index = II.getOperand(2);
2598 Type *Ty = II.getType();
2599 Value *PassThru = ConstantAggregateZero::get(Ty);
2600
2601 // Contiguous gather => masked load.
2602 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2603 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2604 Value *IndexBase;
2606 m_Value(IndexBase), m_SpecificInt(1)))) {
2607 Align Alignment =
2608 BasePtr->getPointerAlignment(II.getDataLayout());
2609
2610 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2611 BasePtr, IndexBase);
2612 CallInst *MaskedLoad =
2613 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2614 MaskedLoad->takeName(&II);
2615 return IC.replaceInstUsesWith(II, MaskedLoad);
2616 }
2617
2618 return std::nullopt;
2619}
2620
2621static std::optional<Instruction *>
2623 Value *Val = II.getOperand(0);
2624 Value *Mask = II.getOperand(1);
2625 Value *BasePtr = II.getOperand(2);
2626 Value *Index = II.getOperand(3);
2627 Type *Ty = Val->getType();
2628
2629 // Contiguous scatter => masked store.
2630 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2631 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2632 Value *IndexBase;
2634 m_Value(IndexBase), m_SpecificInt(1)))) {
2635 Align Alignment =
2636 BasePtr->getPointerAlignment(II.getDataLayout());
2637
2638 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2639 BasePtr, IndexBase);
2640 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2641
2642 return IC.eraseInstFromFunction(II);
2643 }
2644
2645 return std::nullopt;
2646}
2647
2648static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2649 IntrinsicInst &II) {
2650 Type *Int32Ty = IC.Builder.getInt32Ty();
2651 Value *Pred = II.getOperand(0);
2652 Value *Vec = II.getOperand(1);
2653 Value *DivVec = II.getOperand(2);
2654
2655 Value *SplatValue = getSplatValue(DivVec);
2656 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2657 if (!SplatConstantInt)
2658 return std::nullopt;
2659
2660 APInt Divisor = SplatConstantInt->getValue();
2661 const int64_t DivisorValue = Divisor.getSExtValue();
2662 if (DivisorValue == -1)
2663 return std::nullopt;
2664 if (DivisorValue == 1)
2665 return IC.replaceInstUsesWith(II, Vec);
2666
2667 if (Divisor.isPowerOf2()) {
2668 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2669 auto ASRD = IC.Builder.CreateIntrinsic(
2670 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2671 return IC.replaceInstUsesWith(II, ASRD);
2672 }
2673 if (Divisor.isNegatedPowerOf2()) {
2674 Divisor.negate();
2675 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2676 auto ASRD = IC.Builder.CreateIntrinsic(
2677 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2678 auto NEG = IC.Builder.CreateIntrinsic(
2679 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2680 return IC.replaceInstUsesWith(II, NEG);
2681 }
2682
2683 return std::nullopt;
2684}
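// Illustrative examples of the power-of-two folds above (pseudo notation):
//   sve.sdiv(pg, %x, splat(8))   -->  sve.asrd(pg, %x, 3)
//   sve.sdiv(pg, %x, splat(-8))  -->  %t = sve.asrd(pg, %x, 3)
//                                     sve.neg(%t, pg, %t)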
2685
2686bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2687 size_t VecSize = Vec.size();
2688 if (VecSize == 1)
2689 return true;
2690 if (!isPowerOf2_64(VecSize))
2691 return false;
2692 size_t HalfVecSize = VecSize / 2;
2693
2694 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2695 RHS != Vec.end(); LHS++, RHS++) {
2696 if (*LHS != nullptr && *RHS != nullptr) {
2697 if (*LHS == *RHS)
2698 continue;
2699 else
2700 return false;
2701 }
2702 if (!AllowPoison)
2703 return false;
2704 if (*LHS == nullptr && *RHS != nullptr)
2705 *LHS = *RHS;
2706 }
2707
2708 Vec.resize(HalfVecSize);
2709 SimplifyValuePattern(Vec, AllowPoison);
2710 return true;
2711}
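// Illustrative examples (a and b are hypothetical scalars; "poison" marks an
// element that was never inserted): with AllowPoison the helper halves
//   {a, b, a, b}       -->  {a, b}
//   {a, poison, a, b}  -->  {a, b}
// but rejects {a, b, b, a}, which has no repeating half-pattern.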
2712
2713// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2714// to dupqlane(f64(C)) where C is A concatenated with B
2715static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2716 IntrinsicInst &II) {
2717 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2718 if (!match(II.getOperand(0),
2720 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2721 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2722 return std::nullopt;
2723 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2724
2725 // Insert the scalars into a container ordered by InsertElement index
2726 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2727 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2728 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2729 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2730 CurrentInsertElt = InsertElt->getOperand(0);
2731 }
2732
2733 bool AllowPoison =
2734 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2735 if (!SimplifyValuePattern(Elts, AllowPoison))
2736 return std::nullopt;
2737
2738 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2739 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2740 for (size_t I = 0; I < Elts.size(); I++) {
2741 if (Elts[I] == nullptr)
2742 continue;
2743 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2744 IC.Builder.getInt64(I));
2745 }
2746 if (InsertEltChain == nullptr)
2747 return std::nullopt;
2748
2749 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2750 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2751 // to be bitcast to a type wide enough to fit the sequence, splatted, and then
2752 // narrowed back to the original type.
2753 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2754 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2755 IIScalableTy->getMinNumElements() /
2756 PatternWidth;
2757
2758 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2759 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2760 auto *WideShuffleMaskTy =
2761 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2762
2763 auto InsertSubvector = IC.Builder.CreateInsertVector(
2764 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2765 uint64_t(0));
2766 auto WideBitcast =
2767 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2768 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2769 auto WideShuffle = IC.Builder.CreateShuffleVector(
2770 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2771 auto NarrowBitcast =
2772 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2773
2774 return IC.replaceInstUsesWith(II, NarrowBitcast);
2775}
2776
2777static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2778 IntrinsicInst &II) {
2779 Value *A = II.getArgOperand(0);
2780 Value *B = II.getArgOperand(1);
2781 if (A == B)
2782 return IC.replaceInstUsesWith(II, A);
2783
2784 return std::nullopt;
2785}
2786
2787static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2788 IntrinsicInst &II) {
2789 Value *Pred = II.getOperand(0);
2790 Value *Vec = II.getOperand(1);
2791 Value *Shift = II.getOperand(2);
2792
2793 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2794 Value *AbsPred, *MergedValue;
2796 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2798 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2799
2800 return std::nullopt;
2801
2802 // The transform is valid if any of the following are true:
2803 // * The ABS merge value is an undef or non-negative
2804 // * The ABS predicate is all active
2805 // * The ABS predicate and the SRSHL predicates are the same
2806 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2807 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2808 return std::nullopt;
2809
2810 // Only valid when the shift amount is non-negative, otherwise the rounding
2811 // behaviour of SRSHL cannot be ignored.
2812 if (!match(Shift, m_NonNegative()))
2813 return std::nullopt;
2814
2815 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2816 {II.getType()}, {Pred, Vec, Shift});
2817
2818 return IC.replaceInstUsesWith(II, LSL);
2819}
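// Illustrative example (pseudo notation, value names hypothetical): the value
// is known non-negative via the ABS and the shift amount is a non-negative
// constant, so the rounding shift degrades to a plain left shift:
//   %a = sve.abs(undef, %pg_abs, %x)
//   sve.srshl(%pg, %a, splat(2))  -->  sve.lsl(%pg, %a, splat(2))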
2820
2821static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2822 IntrinsicInst &II) {
2823 Value *Vec = II.getOperand(0);
2824
2825 if (getSplatValue(Vec) == II.getOperand(1))
2826 return IC.replaceInstUsesWith(II, Vec);
2827
2828 return std::nullopt;
2829}
2830
2831static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2832 IntrinsicInst &II) {
2833 // If this barrier is post-dominated by an identical one, we can remove it.
2834 auto *NI = II.getNextNode();
2835 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2836 auto CanSkipOver = [](Instruction *I) {
2837 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2838 };
2839 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2840 auto *NIBB = NI->getParent();
2841 NI = NI->getNextNode();
2842 if (!NI) {
2843 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2844 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2845 else
2846 break;
2847 }
2848 }
2849 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2850 if (NextII && II.isIdenticalTo(NextII))
2851 return IC.eraseInstFromFunction(II);
2852
2853 return std::nullopt;
2854}
2855
2856static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2857 IntrinsicInst &II) {
2858 return IC.replaceInstUsesWith(
2859 II,
2860 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2861 {II.getType(), II.getOperand(0)->getType()},
2862 {II.getOperand(0), II.getOperand(1)}));
2863}
2864
2865static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2866 IntrinsicInst &II) {
2867 if (match(II.getOperand(0), m_SpecificInt(AArch64SVEPredPattern::all)))
2868 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2869 return std::nullopt;
2870}
2871
2872static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2874 unsigned NumBits) {
2875 Value *Passthru = II.getOperand(0);
2876 Value *Pg = II.getOperand(1);
2877 Value *Op = II.getOperand(2);
2878
2879 // Convert UXT[BHW] to AND.
2880 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2881 auto *Ty = cast<VectorType>(II.getType());
2882 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2883 auto *Mask = ConstantInt::get(Ty, MaskValue);
2884 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2885 {Pg, Op, Mask});
2886 return IC.replaceInstUsesWith(II, And);
2887 }
2888
2889 return std::nullopt;
2890}
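// Illustrative example for uxtb on 32-bit elements (pseudo notation):
//   sve.uxtb(undef, %pg, %x)  -->  sve.and.u(%pg, %x, splat(i32 255))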
2891
2892static std::optional<Instruction *>
2894 SMEAttrs FnSMEAttrs(*II.getFunction());
2895 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2896 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2897 return IC.replaceInstUsesWith(
2898 II, ConstantInt::getBool(II.getType(), IsStreaming));
2899 return std::nullopt;
2900}
2901
2902std::optional<Instruction *>
2904 IntrinsicInst &II) const {
2906 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2907 return I;
2908
2909 Intrinsic::ID IID = II.getIntrinsicID();
2910 switch (IID) {
2911 default:
2912 break;
2913 case Intrinsic::aarch64_dmb:
2914 return instCombineDMB(IC, II);
2915 case Intrinsic::aarch64_neon_fmaxnm:
2916 case Intrinsic::aarch64_neon_fminnm:
2917 return instCombineMaxMinNM(IC, II);
2918 case Intrinsic::aarch64_sve_convert_from_svbool:
2919 return instCombineConvertFromSVBool(IC, II);
2920 case Intrinsic::aarch64_sve_dup:
2921 return instCombineSVEDup(IC, II);
2922 case Intrinsic::aarch64_sve_dup_x:
2923 return instCombineSVEDupX(IC, II);
2924 case Intrinsic::aarch64_sve_cmpne:
2925 case Intrinsic::aarch64_sve_cmpne_wide:
2926 return instCombineSVECmpNE(IC, II);
2927 case Intrinsic::aarch64_sve_rdffr:
2928 return instCombineRDFFR(IC, II);
2929 case Intrinsic::aarch64_sve_lasta:
2930 case Intrinsic::aarch64_sve_lastb:
2931 return instCombineSVELast(IC, II);
2932 case Intrinsic::aarch64_sve_clasta_n:
2933 case Intrinsic::aarch64_sve_clastb_n:
2934 return instCombineSVECondLast(IC, II);
2935 case Intrinsic::aarch64_sve_cntd:
2936 return instCombineSVECntElts(IC, II, 2);
2937 case Intrinsic::aarch64_sve_cntw:
2938 return instCombineSVECntElts(IC, II, 4);
2939 case Intrinsic::aarch64_sve_cnth:
2940 return instCombineSVECntElts(IC, II, 8);
2941 case Intrinsic::aarch64_sve_cntb:
2942 return instCombineSVECntElts(IC, II, 16);
2943 case Intrinsic::aarch64_sme_cntsd:
2944 return instCombineSMECntsd(IC, II, ST);
2945 case Intrinsic::aarch64_sve_ptest_any:
2946 case Intrinsic::aarch64_sve_ptest_first:
2947 case Intrinsic::aarch64_sve_ptest_last:
2948 return instCombineSVEPTest(IC, II);
2949 case Intrinsic::aarch64_sve_fadd:
2950 return instCombineSVEVectorFAdd(IC, II);
2951 case Intrinsic::aarch64_sve_fadd_u:
2952 return instCombineSVEVectorFAddU(IC, II);
2953 case Intrinsic::aarch64_sve_fmul_u:
2954 return instCombineSVEVectorBinOp(IC, II);
2955 case Intrinsic::aarch64_sve_fsub:
2956 return instCombineSVEVectorFSub(IC, II);
2957 case Intrinsic::aarch64_sve_fsub_u:
2958 return instCombineSVEVectorFSubU(IC, II);
2959 case Intrinsic::aarch64_sve_add:
2960 return instCombineSVEVectorAdd(IC, II);
2961 case Intrinsic::aarch64_sve_add_u:
2962 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2963 Intrinsic::aarch64_sve_mla_u>(
2964 IC, II, true);
2965 case Intrinsic::aarch64_sve_sub:
2966 return instCombineSVEVectorSub(IC, II);
2967 case Intrinsic::aarch64_sve_sub_u:
2968 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2969 Intrinsic::aarch64_sve_mls_u>(
2970 IC, II, true);
2971 case Intrinsic::aarch64_sve_tbl:
2972 return instCombineSVETBL(IC, II);
2973 case Intrinsic::aarch64_sve_uunpkhi:
2974 case Intrinsic::aarch64_sve_uunpklo:
2975 case Intrinsic::aarch64_sve_sunpkhi:
2976 case Intrinsic::aarch64_sve_sunpklo:
2977 return instCombineSVEUnpack(IC, II);
2978 case Intrinsic::aarch64_sve_uzp1:
2979 return instCombineSVEUzp1(IC, II);
2980 case Intrinsic::aarch64_sve_zip1:
2981 case Intrinsic::aarch64_sve_zip2:
2982 return instCombineSVEZip(IC, II);
2983 case Intrinsic::aarch64_sve_ld1_gather_index:
2984 return instCombineLD1GatherIndex(IC, II);
2985 case Intrinsic::aarch64_sve_st1_scatter_index:
2986 return instCombineST1ScatterIndex(IC, II);
2987 case Intrinsic::aarch64_sve_ld1:
2988 return instCombineSVELD1(IC, II, DL);
2989 case Intrinsic::aarch64_sve_st1:
2990 return instCombineSVEST1(IC, II, DL);
2991 case Intrinsic::aarch64_sve_sdiv:
2992 return instCombineSVESDIV(IC, II);
2993 case Intrinsic::aarch64_sve_sel:
2994 return instCombineSVESel(IC, II);
2995 case Intrinsic::aarch64_sve_srshl:
2996 return instCombineSVESrshl(IC, II);
2997 case Intrinsic::aarch64_sve_dupq_lane:
2998 return instCombineSVEDupqLane(IC, II);
2999 case Intrinsic::aarch64_sve_insr:
3000 return instCombineSVEInsr(IC, II);
3001 case Intrinsic::aarch64_sve_whilelo:
3002 return instCombineWhilelo(IC, II);
3003 case Intrinsic::aarch64_sve_ptrue:
3004 return instCombinePTrue(IC, II);
3005 case Intrinsic::aarch64_sve_uxtb:
3006 return instCombineSVEUxt(IC, II, 8);
3007 case Intrinsic::aarch64_sve_uxth:
3008 return instCombineSVEUxt(IC, II, 16);
3009 case Intrinsic::aarch64_sve_uxtw:
3010 return instCombineSVEUxt(IC, II, 32);
3011 case Intrinsic::aarch64_sme_in_streaming_mode:
3012 return instCombineInStreamingMode(IC, II);
3013 }
3014
3015 return std::nullopt;
3016}
3017
3018 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3019 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3020 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3021 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3022 SimplifyAndSetOp) const {
3023 switch (II.getIntrinsicID()) {
3024 default:
3025 break;
3026 case Intrinsic::aarch64_neon_fcvtxn:
3027 case Intrinsic::aarch64_neon_rshrn:
3028 case Intrinsic::aarch64_neon_sqrshrn:
3029 case Intrinsic::aarch64_neon_sqrshrun:
3030 case Intrinsic::aarch64_neon_sqshrn:
3031 case Intrinsic::aarch64_neon_sqshrun:
3032 case Intrinsic::aarch64_neon_sqxtn:
3033 case Intrinsic::aarch64_neon_sqxtun:
3034 case Intrinsic::aarch64_neon_uqrshrn:
3035 case Intrinsic::aarch64_neon_uqshrn:
3036 case Intrinsic::aarch64_neon_uqxtn:
3037 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3038 break;
3039 }
3040
3041 return std::nullopt;
3042}
3043
3045 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3047}
3048
3051 switch (K) {
3053 return TypeSize::getFixed(64);
3055 if (ST->useSVEForFixedLengthVectors() &&
3056 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3057 return TypeSize::getFixed(
3058 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3059 else if (ST->isNeonAvailable())
3060 return TypeSize::getFixed(128);
3061 else
3062 return TypeSize::getFixed(0);
3064 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3066 return TypeSize::getScalable(128);
3067 else
3068 return TypeSize::getScalable(0);
3069 }
3070 llvm_unreachable("Unsupported register kind");
3071}
3072
3073bool AArch64TTIImpl::isSingleExtWideningInstruction(
3074 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3075 Type *SrcOverrideTy) const {
3076 // A helper that returns a vector type whose element type is taken from the
3077 // given ArgTy and whose element count is taken from DstTy.
3078 auto toVectorTy = [&](Type *ArgTy) {
3079 return VectorType::get(ArgTy->getScalarType(),
3080 cast<VectorType>(DstTy)->getElementCount());
3081 };
3082
3083 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3084 // i32, i64]. SVE doesn't generally have the same set of instructions to
3085 // perform an extend with the add/sub/mul. There are SMULLB style
3086 // instructions, but they operate on top/bottom, requiring some sort of lane
3087 // interleaving to be used with zext/sext.
3088 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3089 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3090 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3091 return false;
3092
3093 Type *SrcTy = SrcOverrideTy;
3094 switch (Opcode) {
3095 case Instruction::Add: // UADDW(2), SADDW(2).
3096 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3097 // The second operand needs to be an extend
3098 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3099 if (!SrcTy)
3100 SrcTy =
3101 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3102 break;
3103 }
3104
3105 if (Opcode == Instruction::Sub)
3106 return false;
3107
3108 // UADDW(2), SADDW(2) can be commuted.
3109 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3110 if (!SrcTy)
3111 SrcTy =
3112 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3113 break;
3114 }
3115 return false;
3116 }
3117 default:
3118 return false;
3119 }
3120
3121 // Legalize the destination type and ensure it can be used in a widening
3122 // operation.
3123 auto DstTyL = getTypeLegalizationCost(DstTy);
3124 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3125 return false;
3126
3127 // Legalize the source type and ensure it can be used in a widening
3128 // operation.
3129 assert(SrcTy && "Expected some SrcTy");
3130 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3131 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3132 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3133 return false;
3134
3135 // Get the total number of vector elements in the legalized types.
3136 InstructionCost NumDstEls =
3137 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3138 InstructionCost NumSrcEls =
3139 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3140
3141 // Return true if the legalized types have the same number of vector elements
3142 // and the destination element type size is twice that of the source type.
3143 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3144}
3145
3146Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3148 Type *SrcOverrideTy) const {
3149 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3150 Opcode != Instruction::Mul)
3151 return nullptr;
3152
3153 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3154 // i32, i64]. SVE doesn't generally have the same set of instructions to
3155 // perform an extend with the add/sub/mul. There are SMULLB style
3156 // instructions, but they operate on top/bottom, requiring some sort of lane
3157 // interleaving to be used with zext/sext.
3158 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3159 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3160 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3161 return nullptr;
3162
3163 auto getScalarSizeWithOverride = [&](const Value *V) {
3164 if (SrcOverrideTy)
3165 return SrcOverrideTy->getScalarSizeInBits();
3166 return cast<Instruction>(V)
3167 ->getOperand(0)
3168 ->getType()
3169 ->getScalarSizeInBits();
3170 };
3171
3172 unsigned MaxEltSize = 0;
3173 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3174 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3175 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3176 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3177 MaxEltSize = std::max(EltSize0, EltSize1);
3178 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3179 isa<SExtInst, ZExtInst>(Args[1])) {
3180 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3181 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3182 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3183 // enough.
3184 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3185 return nullptr;
3186 MaxEltSize = DstEltSize / 2;
3187 } else if (Opcode == Instruction::Mul &&
3188 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3189 // If one of the operands is a Zext and the other has enough zero bits
3190 // to be treated as unsigned, we can still generate a umull, meaning the
3191 // zext is free.
3192 KnownBits Known =
3193 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3194 if (Args[0]->getType()->getScalarSizeInBits() -
3195 Known.Zero.countLeadingOnes() >
3196 DstTy->getScalarSizeInBits() / 2)
3197 return nullptr;
3198
3199 MaxEltSize =
3200 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3201 } else
3202 return nullptr;
3203
3204 if (MaxEltSize * 2 > DstEltSize)
3205 return nullptr;
3206
3207 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3208 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3209 return nullptr;
3210 return ExtTy;
3211}
3212
3213// s/urhadd instructions implement the following pattern, making the
3214// extends free:
3215// %x = add ((zext i8 -> i16), 1)
3216// %y = (zext i8 -> i16)
3217// trunc i16 (lshr (add %x, %y), 1) -> i8
3218//
3219bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3220 Type *Src) const {
3221 // The source should be a legal vector type.
3222 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3223 (Src->isScalableTy() && !ST->hasSVE2()))
3224 return false;
3225
3226 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3227 return false;
3228
3229 // Look for trunc/shl/add before trying to match the pattern.
3230 const Instruction *Add = ExtUser;
3231 auto *AddUser =
3232 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3233 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3234 Add = AddUser;
3235
3236 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3237 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3238 return false;
3239
3240 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3241 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3242 Src->getScalarSizeInBits() !=
3243 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3244 return false;
3245
3246 // Try to match the whole pattern. Ext could be either the first or second
3247 // m_ZExtOrSExt matched.
3248 Instruction *Ex1, *Ex2;
3249 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3250 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3251 return false;
3252
3253 // Ensure both extends are of the same type
3254 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3255 Ex1->getOpcode() == Ex2->getOpcode())
3256 return true;
3257
3258 return false;
3259}
3260
3261InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3262 Type *Src,
3263 TTI::CastContextHint CCH,
3264 TTI::TargetCostKind CostKind,
3265 const Instruction *I) const {
3266 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3267 assert(ISD && "Invalid opcode");
3268 // If the cast is observable, and it is used by a widening instruction (e.g.,
3269 // uaddl, saddw, etc.), it may be free.
3270 if (I && I->hasOneUser()) {
3271 auto *SingleUser = cast<Instruction>(*I->user_begin());
3272 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3273 if (Type *ExtTy = isBinExtWideningInstruction(
3274 SingleUser->getOpcode(), Dst, Operands,
3275 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3276 // The cost from Src->Src*2 needs to be added if required; the cost from
3277 // Src*2->ExtTy is free.
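// E.g. (an illustrative sketch): for a zext <2 x i16> -> <2 x i64> feeding a
// widening op whose ExtTy is <2 x i64>, only the <2 x i16> -> <2 x i32> step
// is charged below; the remaining doubling is subsumed by the widening
// instruction itself.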
3278 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3279 Type *DoubleSrcTy =
3280 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3281 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3282 TTI::CastContextHint::None, CostKind);
3283 }
3284
3285 return 0;
3286 }
3287
3288 if (isSingleExtWideningInstruction(
3289 SingleUser->getOpcode(), Dst, Operands,
3290 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3291 // For adds, only count the second operand as free if both operands are
3292 // extends but not the same operation (i.e. both operands are not free in
3293 // add(sext, zext)).
3294 if (SingleUser->getOpcode() == Instruction::Add) {
3295 if (I == SingleUser->getOperand(1) ||
3296 (isa<CastInst>(SingleUser->getOperand(1)) &&
3297 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3298 return 0;
3299 } else {
3300 // Others are free so long as isSingleExtWideningInstruction
3301 // returned true.
3302 return 0;
3303 }
3304 }
3305
3306 // The cast will be free for the s/urhadd instructions
3307 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3308 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3309 return 0;
3310 }
3311
3312 EVT SrcTy = TLI->getValueType(DL, Src);
3313 EVT DstTy = TLI->getValueType(DL, Dst);
3314
3315 if (!SrcTy.isSimple() || !DstTy.isSimple())
3316 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3317
3318 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3319 // we use fcvtx under SVE2. Give them invalid costs.
3320 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3321 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3322 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3323 return InstructionCost::getInvalid();
3324
3325 static const TypeConversionCostTblEntry BF16Tbl[] = {
3326 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3327 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3328 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3329 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3330 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3331 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3332 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3333 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3334 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3335 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3336 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3337 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3338 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3339 };
3340
3341 if (ST->hasBF16())
3342 if (const auto *Entry = ConvertCostTableLookup(
3343 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3344 return Entry->Cost;
3345
3346 // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3347 // The cost of unpacking twice is artificially increased for now in order
3348 // to avoid regressions against NEON, which will use tbl instructions directly
3349 // instead of multiple layers of [s|u]unpk[lo|hi].
3350 // We use the unpacks in cases where the destination type is illegal and
3351 // requires splitting of the input, even if the input type itself is legal.
3352 const unsigned int SVE_EXT_COST = 1;
3353 const unsigned int SVE_FCVT_COST = 1;
3354 const unsigned int SVE_UNPACK_ONCE = 4;
3355 const unsigned int SVE_UNPACK_TWICE = 16;
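// For example (an illustrative reading of the entries below): the cost of
// sitofp nxv8i16 -> nxv8f32 is modelled as SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST
// = 4 + 2 = 6, i.e. one sunpklo/sunpkhi pair plus two scvtf instructions.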
3356
3357 static const TypeConversionCostTblEntry ConversionTbl[] = {
3358 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3359 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3360 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3361 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3362 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3363 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3364 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3365 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3366 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3367 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3368 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3369 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3370 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3371 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3372 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3373 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3374 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3375 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3376 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3377 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3378
3379 // Truncations on nxvmiN
3380 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3381 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3382 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3383 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3384 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3385 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3386 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3387 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3388 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3389 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3390 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3391 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3392 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3393 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3394 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3395 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3396 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3397 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3398 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3399 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3400 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3401 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3402 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3403 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3404 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3405 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3406 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3407 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3408 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3409 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3410 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3411 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3412 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3413
3414 // The number of shll instructions for the extension.
3415 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3416 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3417 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3418 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3419 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3420 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3421 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3422 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3423 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3424 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3425 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3426 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3427 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3428 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3429 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3430 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3431
3432 // FP Ext and trunc
3433 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3434 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3435 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3436 // FP16
3437 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3438 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3439 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3440 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3441 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3442 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3443 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3444 // BF16 (uses shift)
3445 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3446 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3447 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3448 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3449 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3450 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3451 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3452 // FP Ext and trunc
3453 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3454 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3455 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3456 // FP16
3457 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3458 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3459 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3460 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3461 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3462 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3463 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3464 // BF16 (more complex; the +bf16 case is handled above)
3465 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3466 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3467 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3468 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3469 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3470 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3471 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3472 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3473
3474 // LowerVectorINT_TO_FP:
3475 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3476 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3477 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3478 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3479 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3480 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3481
3482 // SVE: to nxv2f16
3483 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3484 SVE_EXT_COST + SVE_FCVT_COST},
3485 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3486 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3487 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3488 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3489 SVE_EXT_COST + SVE_FCVT_COST},
3490 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3491 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3492 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3493
3494 // SVE: to nxv4f16
3495 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3496 SVE_EXT_COST + SVE_FCVT_COST},
3497 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3498 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3499 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3500 SVE_EXT_COST + SVE_FCVT_COST},
3501 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3502 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3503
3504 // SVE: to nxv8f16
3505 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3506 SVE_EXT_COST + SVE_FCVT_COST},
3507 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3508 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3509 SVE_EXT_COST + SVE_FCVT_COST},
3510 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3511
3512 // SVE: to nxv16f16
3513 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3514 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3515 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3516 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3517
3518 // Complex: to v2f32
3519 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3520 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3521 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3522 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3523
3524 // SVE: to nxv2f32
3525 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3526 SVE_EXT_COST + SVE_FCVT_COST},
3527 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3528 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3529 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3530 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3531 SVE_EXT_COST + SVE_FCVT_COST},
3532 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3533 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3534 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3535
3536 // Complex: to v4f32
3537 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3538 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3539 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3540 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3541
3542 // SVE: to nxv4f32
3543 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3544 SVE_EXT_COST + SVE_FCVT_COST},
3545 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3546 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3547 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3548 SVE_EXT_COST + SVE_FCVT_COST},
3549 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3550 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3551
3552 // Complex: to v8f32
3553 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3554 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3555 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3556 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3557
3558 // SVE: to nxv8f32
3559 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3560 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3561 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3562 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3563 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3564 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3565 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3566 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3567
3568 // SVE: to nxv16f32
3569 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3570 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3571 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3572 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3573
3574 // Complex: to v16f32
3575 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3576 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3577
3578 // Complex: to v2f64
3579 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3580 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3581 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3582 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3583 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3584 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3585
3586 // SVE: to nxv2f64
3587 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3588 SVE_EXT_COST + SVE_FCVT_COST},
3589 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3590 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3591 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3592 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3593 SVE_EXT_COST + SVE_FCVT_COST},
3594 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3595 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3596 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3597
3598 // Complex: to v4f64
3599 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3600 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3601
3602 // SVE: to nxv4f64
3603 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3604 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3605 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3606 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3607 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3608 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3609 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3610 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3611 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3612 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3613 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3614 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3615
3616 // SVE: to nxv8f64
3617 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3618 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3619 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3620 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3621 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3622 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3623 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3624 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3625
3626 // LowerVectorFP_TO_INT
3627 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3628 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3629 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3630 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3631 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3632 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3633
3634 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3635 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3636 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3637 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3638 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3639 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3640 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3641
3642 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3643 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3644 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3645 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3646 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3647
3648 // Complex, from nxv2f32.
3649 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3650 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3651 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3652 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3653 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3654 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3655 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3656 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3657
3658 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3659 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3660 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3661 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3662 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3663 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3664 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3665
3666 // Complex, from nxv2f64.
3667 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3668 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3669 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3670 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3671 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3672 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3673 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3674 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3675 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3676 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3677
3678 // Complex, from nxv4f32.
3679 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3680 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3681 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3682 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3683 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3684 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3685 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3686 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3687 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3688 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3689
3690 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3691 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3692 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3693 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3694 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3695
3696 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3697 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3698 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3699 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3700 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3701 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3702 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3703
3704 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3705 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3706 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3707 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3708 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3709
3710 // Complex, from nxv8f16.
3711 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3712 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3713 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3714 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3715 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3716 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3717 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3718 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3719 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3720 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3721
3722 // Complex, from nxv4f16.
3723 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3724 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3725 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3726 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3727 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3728 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3729 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3730 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3731
3732 // Complex, from nxv2f16.
3733 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3734 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3735 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3736 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3737 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3738 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3739 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3740 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3741
3742 // Truncate from nxvmf32 to nxvmf16.
3743 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3744 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3745 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3746
3747 // Truncate from nxvmf32 to nxvmbf16.
3748 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3749 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3750 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3751
3752 // Truncate from nxvmf64 to nxvmf16.
3753 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3754 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3755 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3756
3757 // Truncate from nxvmf64 to nxvmbf16.
3758 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3759 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3760 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3761
3762 // Truncate from nxvmf64 to nxvmf32.
3763 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3764 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3765 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3766
3767 // Extend from nxvmf16 to nxvmf32.
3768 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3769 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3770 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3771
3772 // Extend from nxvmbf16 to nxvmf32.
3773 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3774 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3775 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3776
3777 // Extend from nxvmf16 to nxvmf64.
3778 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3779 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3780 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3781
3782 // Extend from nxvmbf16 to nxvmf64.
3783 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3784 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3785 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3786
3787 // Extend from nxvmf32 to nxvmf64.
3788 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3789 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3790 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3791
3792 // Bitcasts from float to integer
3793 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3794 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3795 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3796
3797 // Bitcasts from integer to float
3798 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3799 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3800 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3801
3802 // Add cost for extending to illegal (too wide) scalable vectors.
3803 // zero/sign extend are implemented by multiple unpack operations,
3804 // where each operation has a cost of 1.
3805 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3806 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3807 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3808 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3809 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3810 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3811
3812 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3813 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3814 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3815 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3816 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3817 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3818 };
3819
3820 // We have to estimate the cost of a fixed-length operation performed on
3821 // SVE registers by the number of SVE registers required to represent the
3822 // fixed-length type.
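// E.g. (an illustrative sketch): for a fixed-length v8i16 -> v8i32 zext that
// is handled via SVE registers, the wider type is legalized first and the
// cost becomes LT.first times the cost of the equivalent scalable conversion
// built below.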
3823 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3824 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3825 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3826 ST->useSVEForFixedLengthVectors(WiderTy)) {
3827 std::pair<InstructionCost, MVT> LT =
3828 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3829 unsigned NumElements =
3830 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3831 return LT.first *
3832 getCastInstrCost(
3833 Opcode,
3834 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3835 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3836 CostKind, I);
3837 }
3838
3839 if (const auto *Entry = ConvertCostTableLookup(
3840 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3841 return Entry->Cost;
3842
3843 static const TypeConversionCostTblEntry FP16Tbl[] = {
3844 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3845 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3846 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3847 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3848 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3849 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3850 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3851 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3852 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3853 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3854 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3855 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3856 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3857 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3858 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3859 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3860 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3861 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3862 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3863 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3864 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3865 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3866 };
3867
3868 if (ST->hasFullFP16())
3869 if (const auto *Entry = ConvertCostTableLookup(
3870 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3871 return Entry->Cost;
3872
3873 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3874 // double-rounding issues.
3875 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3876 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3877 isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src))
3878 return cast<FixedVectorType>(Dst)->getNumElements() *
3879 getCastInstrCost(Opcode, Dst->getScalarType(),
3880 Src->getScalarType(), CCH, CostKind) +
3881 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Src), false,
3882 true, CostKind) +
3883 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Dst), true,
3884 false, CostKind);
3885
3886 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3887 CCH == TTI::CastContextHint::Masked &&
3888 ST->isSVEorStreamingSVEAvailable() &&
3889 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3890 TargetLowering::TypePromoteInteger &&
3891 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3892 TargetLowering::TypeSplitVector) {
3893 // The standard behaviour in the backend for these cases is to split the
3894 // extend up into two parts:
3895 // 1. Perform an extending load or masked load up to the legal type.
3896 // 2. Extend the loaded data to the final type.
3897 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3898 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3899 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3900 Opcode, LegalTy, Src, CCH, CostKind, I);
3901 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3902 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3903 return Part1 + Part2;
3904 }
3905
3906 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3907 // but we also want to include the TTI::CastContextHint::Masked case.
3908 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3909 CCH == TTI::CastContextHint::Masked &&
3910 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3911 return 0;
3912
3913 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3914}
3915
3916InstructionCost
3917AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3918 VectorType *VecTy, unsigned Index,
3919 TTI::TargetCostKind CostKind) const {
3920
3921 // Make sure we were given a valid extend opcode.
3922 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3923 "Invalid opcode");
3924
3925 // We are extending an element we extract from a vector, so the source type
3926 // of the extend is the element type of the vector.
3927 auto *Src = VecTy->getElementType();
3928
3929 // Sign- and zero-extends are for integer types only.
3930 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3931
3932 // Get the cost for the extract. We compute the cost (if any) for the extend
3933 // below.
3934 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3935 CostKind, Index, nullptr, nullptr);
3936
3937 // Legalize the types.
3938 auto VecLT = getTypeLegalizationCost(VecTy);
3939 auto DstVT = TLI->getValueType(DL, Dst);
3940 auto SrcVT = TLI->getValueType(DL, Src);
3941
3942 // If the resulting type is still a vector and the destination type is legal,
3943 // we may get the extension for free. If not, get the default cost for the
3944 // extend.
3945 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3946 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3947 CostKind);
3948
3949 // The destination type should be larger than the element type. If not, get
3950 // the default cost for the extend.
3951 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3952 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3953 CostKind);
3954
3955 switch (Opcode) {
3956 default:
3957 llvm_unreachable("Opcode should be either SExt or ZExt");
3958
3959 // For sign-extends, we only need a smov, which performs the extension
3960 // automatically.
3961 case Instruction::SExt:
3962 return Cost;
3963
3964 // For zero-extends, the extend is performed automatically by a umov unless
3965 // the destination type is i64 and the element type is i8 or i16.
3966 case Instruction::ZExt:
3967 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3968 return Cost;
3969 }
3970
3971 // If we are unable to perform the extend for free, get the default cost.
3972 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3973 CostKind);
3974}
3975
3976InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3977 TTI::TargetCostKind CostKind,
3978 const Instruction *I) const {
3979 if (CostKind != TTI::TCK_RecipThroughput)
3980 return Opcode == Instruction::PHI ? 0 : 1;
3981 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3982 // Branches are assumed to be predicted.
3983 return 0;
3984}
3985
3986InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3987 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3988 const Instruction *I, Value *Scalar,
3989 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3990 assert(Val->isVectorTy() && "This must be a vector type");
3991
3992 if (Index != -1U) {
3993 // Legalize the type.
3994 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3995
3996 // This type is legalized to a scalar type.
3997 if (!LT.second.isVector())
3998 return 0;
3999
4000 // The type may be split. For fixed-width vectors we can normalize the
4001 // index to the new type.
4002 if (LT.second.isFixedLengthVector()) {
4003 unsigned Width = LT.second.getVectorNumElements();
4004 Index = Index % Width;
4005 }
4006
4007 // The element at index zero is already inside the vector.
4008 // - For an insert-element or extract-element
4009 // instruction that extracts integers, an explicit FPR -> GPR move is
4010 // needed, so it has a non-zero cost.
4011 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4012 return 0;
4013
4014 // This is recognising an LD1 (single-element structure to one lane of one
4015 // register) instruction. I.e., if this is an `insertelement` instruction
4016 // and its second operand is a load, then we will generate an LD1, which
4017 // is an expensive instruction on some uArchs.
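// E.g. (an illustrative sketch):
//   %l = load i32, ptr %p
//   %v = insertelement <4 x i32> %acc, i32 %l, i64 1
// typically selects to "ld1 { v0.s }[1], [x0]".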
4018 if (I && isa<LoadInst>(I->getOperand(1))) {
4019 if (ST->hasFastLD1Single())
4020 return 0;
4021 return CostKind == TTI::TCK_CodeSize
4022 ? 0
4023 : ST->getVectorInsertExtractBaseCost();
4024 }
4025
4026 // i1 inserts and extracts will include an extra cset or cmp of the vector
4027 // value. Increase the cost by 1 to account for this.
4028 if (Val->getScalarSizeInBits() == 1)
4029 return CostKind == TTI::TCK_CodeSize
4030 ? 2
4031 : ST->getVectorInsertExtractBaseCost() + 1;
4032
4033 // FIXME:
4034 // If the extract-element and insert-element instructions could be
4035 // simplified away (e.g., could be combined into users by looking at use-def
4036 // context), they have no cost. This is not done in the first place for
4037 // compile-time considerations.
4038 }
4039
4040 // In case of Neon, if there exists extractelement from lane != 0 such that
4041 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4042 // 2. extractelement result feeds into fmul.
4043 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4044 // equivalent to 0.
4045 // then the extractelement can be merged with fmul in the backend and it
4046 // incurs no cost.
4047 // e.g.
4048 // define double @foo(<2 x double> %a) {
4049 // %1 = extractelement <2 x double> %a, i32 0
4050 // %2 = extractelement <2 x double> %a, i32 1
4051 // %res = fmul double %1, %2
4052 // ret double %res
4053 // }
4054 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4055 auto ExtractCanFuseWithFmul = [&]() {
4056 // We bail out if the extract is from lane 0.
4057 if (Index == 0)
4058 return false;
4059
4060 // Check if the scalar element type of the vector operand of ExtractElement
4061 // instruction is one of the allowed types.
4062 auto IsAllowedScalarTy = [&](const Type *T) {
4063 return T->isFloatTy() || T->isDoubleTy() ||
4064 (T->isHalfTy() && ST->hasFullFP16());
4065 };
4066
4067 // Check if the extractelement user is scalar fmul.
4068 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4069 // Check if the user is scalar fmul.
4070 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4071 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4072 !BO->getType()->isVectorTy();
4073 };
4074
4075 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4076 // certain scalar type and a certain vector register width.
4077 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4078 auto RegWidth =
4079 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4080 .getFixedValue();
4081 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4082 };
4083
4084 // Check if the type constraints on input vector type and result scalar type
4085 // of extractelement instruction are satisfied.
4086 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4087 return false;
4088
4089 if (Scalar) {
4090 DenseMap<User *, unsigned> UserToExtractIdx;
4091 for (auto *U : Scalar->users()) {
4092 if (!IsUserFMulScalarTy(U))
4093 return false;
4094 // Recording entry for the user is important. Index value is not
4095 // important.
4096 UserToExtractIdx[U];
4097 }
4098 if (UserToExtractIdx.empty())
4099 return false;
4100 for (auto &[S, U, L] : ScalarUserAndIdx) {
4101 for (auto *U : S->users()) {
4102 if (UserToExtractIdx.contains(U)) {
4103 auto *FMul = cast<BinaryOperator>(U);
4104 auto *Op0 = FMul->getOperand(0);
4105 auto *Op1 = FMul->getOperand(1);
4106 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4107 UserToExtractIdx[U] = L;
4108 break;
4109 }
4110 }
4111 }
4112 }
4113 for (auto &[U, L] : UserToExtractIdx) {
4114 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4115 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4116 return false;
4117 }
4118 } else {
4119 const auto *EE = cast<ExtractElementInst>(I);
4120
4121 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4122 if (!IdxOp)
4123 return false;
4124
4125 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4126 if (!IsUserFMulScalarTy(U))
4127 return false;
4128
4129 // Check if the other operand of extractelement is also extractelement
4130 // from lane equivalent to 0.
4131 const auto *BO = cast<BinaryOperator>(U);
4132 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4133 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4134 if (OtherEE) {
4135 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4136 if (!IdxOp)
4137 return false;
4138 return IsExtractLaneEquivalentToZero(
4139 cast<ConstantInt>(OtherEE->getIndexOperand())
4140 ->getValue()
4141 .getZExtValue(),
4142 OtherEE->getType()->getScalarSizeInBits());
4143 }
4144 return true;
4145 });
4146 }
4147 return true;
4148 };
4149
4150 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4151 ExtractCanFuseWithFmul())
4152 return 0;
4153
4154 // All other insert/extracts cost this much.
4155 return CostKind == TTI::TCK_CodeSize ? 1
4156 : ST->getVectorInsertExtractBaseCost();
4157}
4158
4159InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4160 TTI::TargetCostKind CostKind,
4161 unsigned Index,
4162 const Value *Op0,
4163 const Value *Op1) const {
4164 // Treat insert at lane 0 into a poison vector as having zero cost. This
4165 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4166 // single dup) are treated as cheap.
4167 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4168 isa<PoisonValue>(Op0))
4169 return 0;
4170 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4171}
4172
4173InstructionCost AArch64TTIImpl::getVectorInstrCost(
4174 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4175 Value *Scalar,
4176 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4177 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4178 ScalarUserAndIdx);
4179}
4180
4181InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4182 Type *Val,
4183 TTI::TargetCostKind CostKind,
4184 unsigned Index) const {
4185 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4186}
4187
4191 unsigned Index) const {
4192 if (isa<FixedVectorType>(Val))
4194 Index);
4195
4196 // This typically requires both while and lastb instructions in order
4197 // to extract the last element. If this is in a loop the while
4198 // instruction can at least be hoisted out, although it will consume a
4199 // predicate register. The cost should be higher than the base
4200 // extract cost, which is 2 for most CPUs.
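// E.g. (an illustrative sketch): extracting such an element from an nxv4i32
// value is typically a while-style predicate set-up followed by
// "lastb w0, p0, z0.s".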
4201 return CostKind == TTI::TCK_CodeSize
4202 ? 2
4203 : ST->getVectorInsertExtractBaseCost() + 1;
4204}
4205
4206InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4207 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4208 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4209 ArrayRef<Value *> VL) const {
4210 if (isa<ScalableVectorType>(Ty))
4211 return InstructionCost::getInvalid();
4212 if (Ty->getElementType()->isFloatingPointTy())
4213 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4214 CostKind);
4215 unsigned VecInstCost =
4216 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
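// E.g. (an illustrative sketch): with DemandedElts = 0b0101 on a v4i32 and
// both Insert and Extract requested, the estimate below is
// 2 * (1 + 1) * VecInstCost.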
4217 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4218}
4219
4220std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4221 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4222 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4223 std::function<InstructionCost(Type *)> InstCost) const {
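// For example (an illustrative sketch): an fadd on <4 x half> without
// +fullfp16 is costed as fpext <4 x half> -> <4 x float> (doubled when
// neither operand is constant), plus InstCost(<4 x float>), plus an fptrunc
// back when IncludeTrunc is set.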
4224 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4225 return std::nullopt;
4226 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4227 return std::nullopt;
4228 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4229 ST->isNonStreamingSVEorSME2Available())
4230 return std::nullopt;
4231
4232 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4233 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4234 TTI::CastContextHint::None, CostKind);
4235 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4236 Cost *= 2;
4237 Cost += InstCost(PromotedTy);
4238 if (IncludeTrunc)
4239 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4240 TTI::CastContextHint::None, CostKind);
4241 return Cost;
4242}
4243
4244InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4245 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4246 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4247 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4248
4249 // The code-generator is currently not able to handle scalable vectors
4250 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4251 // it. This change will be removed when code-generation for these types is
4252 // sufficiently reliable.
4253 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4254 if (VTy->getElementCount() == ElementCount::getScalable(1))
4255 return InstructionCost::getInvalid();
4256
4257 // TODO: Handle more cost kinds.
4258 if (CostKind != TTI::TCK_RecipThroughput)
4259 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4260 Op2Info, Args, CxtI);
4261
4262 // Legalize the type.
4263 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4264 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4265
4266 // Increase the cost for half and bfloat types if not architecturally
4267 // supported.
4268 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4269 ISD == ISD::FDIV || ISD == ISD::FREM)
4270 if (auto PromotedCost = getFP16BF16PromoteCost(
4271 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4272 // There is not native support for fdiv/frem even with +sve-b16b16.
4273 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4274 [&](Type *PromotedTy) {
4275 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4276 Op1Info, Op2Info);
4277 }))
4278 return *PromotedCost;
4279
4280 // If the operation is a widening instruction (smull or umull) and both
4281 // operands are extends the cost can be cheaper by considering that the
4282 // operation will operate on the narrowest type size possible (double the
4283 // largest input size) and a further extend.
4284 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4285 if (ExtTy != Ty)
4286 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4287 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4288 TTI::CastContextHint::None, CostKind);
4289 return LT.first;
4290 }
4291
4292 switch (ISD) {
4293 default:
4294 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4295 Op2Info);
4296 case ISD::SREM:
4297 case ISD::SDIV:
4298 /*
4299 Notes for sdiv/srem specific costs:
4300 1. This only considers the cases where the divisor is constant, uniform and
4301 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4302 result in some form of (ldr + adrp), corresponding to constant vectors, or
4303 scalarization of the division operation.
4304 2. Constant divisors, either negative in whole or partially, don't result in
4305 significantly different codegen as compared to positive constant divisors.
4306 So, we don't consider negative divisors separately.
4307 3. If the codegen is significantly different with SVE, it has been indicated
4308 using comments at appropriate places.
4309
4310 sdiv specific cases:
4311 -----------------------------------------------------------------------
4312 codegen | pow-of-2 | Type
4313 -----------------------------------------------------------------------
4314 add + cmp + csel + asr | Y | i64
4315 add + cmp + csel + asr | Y | i32
4316 -----------------------------------------------------------------------
4317
4318 srem specific cases:
4319 -----------------------------------------------------------------------
4320 codegen | pow-of-2 | Type
4321 -----------------------------------------------------------------------
4322 negs + and + and + csneg | Y | i64
4323 negs + and + and + csneg | Y | i32
4324 -----------------------------------------------------------------------
4325
4326 other sdiv/srem cases:
4327 -------------------------------------------------------------------------
4328 common codegen | + srem | + sdiv | pow-of-2 | Type
4329 -------------------------------------------------------------------------
4330 smulh + asr + add + add | - | - | N | i64
4331 smull + lsr + add + add | - | - | N | i32
4332 usra | and + sub | sshr | Y | <2 x i64>
4333 2 * (scalar code) | - | - | N | <2 x i64>
4334 usra | bic + sub | sshr + neg | Y | <4 x i32>
4335 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4336 + sshr + usra | | | |
4337 -------------------------------------------------------------------------
4338 */
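// For example (an illustrative reading of the table above): an i32 sdiv by a
// non-power-of-2 uniform constant follows the smull + lsr + add + add
// pattern, so the formula below charges MulCost + AsrCost + 2 * AddCost.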
4339 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4340 InstructionCost AddCost =
4341 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4342 Op1Info.getNoProps(), Op2Info.getNoProps());
4343 InstructionCost AsrCost =
4344 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4345 Op1Info.getNoProps(), Op2Info.getNoProps());
4346 InstructionCost MulCost =
4347 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4348 Op1Info.getNoProps(), Op2Info.getNoProps());
4349 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4350 // have similar cost.
4351 auto VT = TLI->getValueType(DL, Ty);
4352 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4353 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4354 // Neg can be folded into the asr instruction.
4355 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4356 : (3 * AsrCost + AddCost);
4357 } else {
4358 return MulCost + AsrCost + 2 * AddCost;
4359 }
4360 } else if (VT.isVector()) {
4361 InstructionCost UsraCost = 2 * AsrCost;
4362 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4363 // Division with scalable types corresponds to native 'asrd'
4364 // instruction when SVE is available.
4365 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4366
4367 // One more for the negation in SDIV
4368 InstructionCost Cost =
4369 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4370 if (Ty->isScalableTy() && ST->hasSVE())
4371 Cost += 2 * AsrCost;
4372 else {
4373 Cost +=
4374 UsraCost +
4375 (ISD == ISD::SDIV
4376 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4377 : 2 * AddCost);
4378 }
4379 return Cost;
4380 } else if (LT.second == MVT::v2i64) {
4381 return VT.getVectorNumElements() *
4382 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4383 Op1Info.getNoProps(),
4384 Op2Info.getNoProps());
4385 } else {
4386 // When SVE is available, we get:
4387 // smulh + lsr + add/sub + asr + add/sub.
4388 if (Ty->isScalableTy() && ST->hasSVE())
4389 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4390 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4391 }
4392 }
4393 }
4394 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4395 LT.second.isFixedLengthVector()) {
4396 // FIXME: When the constant vector is non-uniform, this may result in
4397 // loading the vector from a constant pool or, in some cases, in
4398 // scalarization. For now, we are approximating this with the
4399 // scalarization cost.
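// E.g. (an illustrative sketch): sdiv <4 x i32> %x, <i32 3, i32 5, i32 7, i32 9>
// is approximated below as twice the extract cost, plus one insert cost, plus
// four scalar sdivs.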
4400 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4401 CostKind, -1, nullptr, nullptr);
4402 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4403 CostKind, -1, nullptr, nullptr);
4404 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4405 return ExtractCost + InsertCost +
4406 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4407 CostKind, Op1Info.getNoProps(),
4408 Op2Info.getNoProps());
4409 }
4410 [[fallthrough]];
4411 case ISD::UDIV:
4412 case ISD::UREM: {
4413 auto VT = TLI->getValueType(DL, Ty);
4414 if (Op2Info.isConstant()) {
4415 // If the operand is a power of 2 we can use the shift or and cost.
4416 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4417 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4418 Op1Info.getNoProps(),
4419 Op2Info.getNoProps());
4420 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4421 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4422 Op1Info.getNoProps(),
4423 Op2Info.getNoProps());
4424
4425 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4426 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4427 // The MULHU will be expanded to UMULL for the types not listed below,
4428 // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
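// E.g. (an illustrative reading): an i64 udiv by a uniform constant has a
// native MULH (HasMULH), so DivCost = MulCost + 2 * AddCost + ShrCost; a urem
// additionally pays MulCost + AddCost for the multiply-and-subtract step.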
4429 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4430 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4431 LT.second == MVT::nxv16i8;
4432 bool Is128bit = LT.second.is128BitVector();
4433
4434 InstructionCost MulCost =
4435 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4436 Op1Info.getNoProps(), Op2Info.getNoProps());
4437 InstructionCost AddCost =
4438 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4439 Op1Info.getNoProps(), Op2Info.getNoProps());
4440 InstructionCost ShrCost =
4441 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4442 Op1Info.getNoProps(), Op2Info.getNoProps());
4443 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4444 (HasMULH ? 0 : ShrCost) + // UMULL shift
4445 AddCost * 2 + ShrCost;
4446 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4447 }
4448 }
4449
4450 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4451 // emitted by the backend even when those functions are not declared in the
4452 // module.
4453 if (!VT.isVector() && VT.getSizeInBits() > 64)
4454 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4455
4456 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4457 Opcode, Ty, CostKind, Op1Info, Op2Info);
4458 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4459 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4460 // SDIV/UDIV operations are lowered using SVE, so the costs can be
4461 // lower.
4462 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4463 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4464 static const CostTblEntry DivTbl[]{
4465 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4466 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4467 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4468 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4469 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4470 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4471
4472 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4473 if (nullptr != Entry)
4474 return Entry->Cost;
4475 }
4476 // For 8/16-bit elements, the cost is higher because the type
4477 // requires promotion and possibly splitting:
4478 if (LT.second.getScalarType() == MVT::i8)
4479 Cost *= 8;
4480 else if (LT.second.getScalarType() == MVT::i16)
4481 Cost *= 4;
4482 return Cost;
4483 } else {
4484 // If one of the operands is a uniform constant then the cost for each
4485 // element is the cost of insertion, extraction and division.
4486 // Insertion cost = 2, extraction cost = 2, division = cost of the
4487 // operation on the scalar type.
4488 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4489 (Op2Info.isConstant() && Op2Info.isUniform())) {
4490 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4491 InstructionCost DivCost = getArithmeticInstrCost(
4492 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4493 return (4 + DivCost) * VTy->getNumElements();
4494 }
4495 }
4496 // On AArch64, without SVE, vector divisions are expanded
4497 // into scalar divisions of each pair of elements.
4498 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4499 -1, nullptr, nullptr);
4500 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4501 nullptr, nullptr);
4502 }
4503
4504 // TODO: if one of the arguments is scalar, then it's not necessary to
4505 // double the cost of handling the vector elements.
4506 Cost += Cost;
4507 }
4508 return Cost;
4509 }
4510 case ISD::MUL:
4511 // When SVE is available, then we can lower the v2i64 operation using
4512 // the SVE mul instruction, which has a lower cost.
4513 if (LT.second == MVT::v2i64 && ST->hasSVE())
4514 return LT.first;
4515
4516 // When SVE is not available, there is no MUL.2d instruction,
4517 // which means mul <2 x i64> is expensive as elements are extracted
4518 // from the vectors and the muls scalarized.
4519 // As getScalarizationOverhead is a bit too pessimistic, we
4520 // estimate the cost for a i64 vector directly here, which is:
4521 // - four 2-cost i64 extracts,
4522 // - two 2-cost i64 inserts, and
4523 // - two 1-cost muls.
4524 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4525 // LT.first = 2 the cost is 28.
4526 if (LT.second != MVT::v2i64)
4527 return LT.first;
4528 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4529 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4530 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4531 nullptr, nullptr) *
4532 2 +
4533 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4534 nullptr, nullptr));
4535 case ISD::ADD:
4536 case ISD::XOR:
4537 case ISD::OR:
4538 case ISD::AND:
4539 case ISD::SRL:
4540 case ISD::SRA:
4541 case ISD::SHL:
4542 // These nodes are marked as 'custom' for combining purposes only.
4543 // We know that they are legal. See LowerAdd in ISelLowering.
4544 return LT.first;
4545
4546 case ISD::FNEG:
4547 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4548 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4549 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4550 CxtI &&
4551 ((CxtI->hasOneUse() &&
4552 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4553 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4554 return 0;
4555 [[fallthrough]];
4556 case ISD::FADD:
4557 case ISD::FSUB:
4558 if (!Ty->getScalarType()->isFP128Ty())
4559 return LT.first;
4560 [[fallthrough]];
4561 case ISD::FMUL:
4562 case ISD::FDIV:
4563 // These nodes are marked as 'custom' just to lower them to SVE.
4564 // We know said lowering will incur no additional cost.
4565 if (!Ty->getScalarType()->isFP128Ty())
4566 return 2 * LT.first;
4567
4568 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4569 Op2Info);
4570 case ISD::FREM:
4571 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4572 // those functions are not declared in the module.
4573 if (!Ty->isVectorTy())
4574 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4575 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4576 Op2Info);
4577 }
4578}
4579
4580InstructionCost
4581AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4582 const SCEV *Ptr,
4583 TTI::TargetCostKind CostKind) const {
4584 // Address computations in vectorized code with non-consecutive addresses will
4585 // likely result in more instructions compared to scalar code where the
4586 // computation can more often be merged into the index mode. The resulting
4587 // extra micro-ops can significantly decrease throughput.
4588 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4589 int MaxMergeDistance = 64;
4590
4591 if (PtrTy->isVectorTy() && SE &&
4592 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4593 return NumVectorInstToHideOverhead;
4594
4595 // In many cases the address computation is not merged into the instruction
4596 // addressing mode.
4597 return 1;
4598}
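// Illustrative consequence of the model above (editorial note): an address
// computation for a vector of pointers whose stride is not a known constant
// below the merge distance is charged NeonNonConstStrideOverhead (default 10),
// while every other address computation is charged 1.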
4599
4600/// Check whether Opcode1 has lower throughput than Opcode2 according to the
4601/// scheduling model.
4602bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4603 unsigned Opcode1, unsigned Opcode2) const {
4604 const MCSchedModel &Sched = ST->getSchedModel();
4605 const TargetInstrInfo *TII = ST->getInstrInfo();
4606 if (!Sched.hasInstrSchedModel())
4607 return false;
4608
4609 const MCSchedClassDesc *SCD1 =
4610 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4611 const MCSchedClassDesc *SCD2 =
4612 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4613 // We cannot handle variant scheduling classes without an MI. If we need to
4614 // support them for any of the instructions whose information we query, we
4615 // might need to add a way to resolve them without an MI, or avoid using the
4616 // scheduling info.
4617 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4618 "Cannot handle variant scheduling classes without an MI");
4619 if (!SCD1->isValid() || !SCD2->isValid())
4620 return false;
4621
4622 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4623 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4624}
4625
4626InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4627 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4628 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4629 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4630 // We don't lower some vector selects well when they are wider than the
4631 // register width. TODO: Improve this with different cost kinds.
4632 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4633 // We would need this many instructions to hide the scalarization happening.
4634 const int AmortizationCost = 20;
4635
4636 // If VecPred is not set, check if we can get a predicate from the context
4637 // instruction, if its type matches the requested ValTy.
4638 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4639 CmpPredicate CurrentPred;
4640 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4641 m_Value())))
4642 VecPred = CurrentPred;
4643 }
4644 // Check if we have a compare/select chain that can be lowered using
4645 // a (F)CMxx & BFI pair.
4646 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4647 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4648 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4649 VecPred == CmpInst::FCMP_UNE) {
4650 static const auto ValidMinMaxTys = {
4651 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4652 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4653 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4654
4655 auto LT = getTypeLegalizationCost(ValTy);
4656 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4657 (ST->hasFullFP16() &&
4658 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4659 return LT.first;
4660 }
4661
4662 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4663 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4664 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4665 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4666 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4667 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4668 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4669 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4670 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4671 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4672 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4673 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4674
4675 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4676 EVT SelValTy = TLI->getValueType(DL, ValTy);
4677 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4678 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4679 SelCondTy.getSimpleVT(),
4680 SelValTy.getSimpleVT()))
4681 return Entry->Cost;
4682 }
4683 }
4684
4685 if (Opcode == Instruction::FCmp) {
4686 if (auto PromotedCost = getFP16BF16PromoteCost(
4687 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4688 // TODO: Consider costing SVE FCMPs.
4689 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4690 InstructionCost Cost =
4691 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4692 CostKind, Op1Info, Op2Info);
4693 if (isa<VectorType>(PromotedTy))
4695 Instruction::Trunc,
4699 return Cost;
4700 }))
4701 return *PromotedCost;
4702
4703 auto LT = getTypeLegalizationCost(ValTy);
4704 // Model unknown fp compares as a libcall.
4705 if (LT.second.getScalarType() != MVT::f64 &&
4706 LT.second.getScalarType() != MVT::f32 &&
4707 LT.second.getScalarType() != MVT::f16)
4708 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4709 {ValTy, ValTy}, CostKind);
4710
4711 // Some comparison operators require expanding to multiple compares + or.
4712 unsigned Factor = 1;
4713 if (!CondTy->isVectorTy() &&
4714 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4715 Factor = 2; // fcmp with 2 selects
4716 else if (isa<FixedVectorType>(ValTy) &&
4717 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4718 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4719 Factor = 3; // fcmxx+fcmyy+or
4720 else if (isa<ScalableVectorType>(ValTy) &&
4721 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4722 Factor = 3; // fcmxx+fcmyy+or
4723
4724 if (isa<ScalableVectorType>(ValTy) &&
4726 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4727 AArch64::FCMEQv4f32))
4728 Factor *= 2;
4729
4730 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4731 }
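  // Worked example (illustrative, editorial note): a scalar float FCMP_ONE
  // with a throughput cost kind legalizes to f32 (LT.first = 1) and takes the
  // Factor = 2 path above, so the returned cost is 2 * 1 = 2.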
4732
4733 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be
4734 // folded to icmp(and, 0), as free, as we can make use of ands, but only if
4735 // the comparison is not unsigned. FIXME: Enable for non-throughput cost
4736 // kinds, providing it will not cause performance regressions.
4737 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4738 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4739 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4740 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4741 if (match(I->getOperand(1), m_Zero()))
4742 return 0;
4743
4744 // x >= 1 / x < 1 -> x > 0 / x <= 0
4745 if (match(I->getOperand(1), m_One()) &&
4746 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4747 return 0;
4748
4749 // x <= -1 / x > -1 -> x > 0 / x <= 0
4750 if (match(I->getOperand(1), m_AllOnes()) &&
4751 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4752 return 0;
4753 }
4754
4755 // The base case handles scalable vectors fine for now, since it treats the
4756 // cost as 1 * legalization cost.
4757 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4758 Op1Info, Op2Info, I);
4759}
4760
4761TTI::MemCmpExpansionOptions
4762AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4763 TTI::MemCmpExpansionOptions Options;
4764 if (ST->requiresStrictAlign()) {
4765 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4766 // a bunch of instructions when strict align is enabled.
4767 return Options;
4768 }
4769 Options.AllowOverlappingLoads = true;
4770 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4771 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4772 // TODO: Though vector loads usually perform well on AArch64, in some targets
4773 // they may wake up the FP unit, which raises the power consumption. Perhaps
4774 // they could be used with no holds barred (-O3).
4775 Options.LoadSizes = {8, 4, 2, 1};
4776 Options.AllowedTailExpansions = {3, 5, 6};
4777 return Options;
4778}
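// Example (illustrative, editorial note): with LoadSizes = {8, 4, 2, 1} and
// AllowOverlappingLoads enabled, a 15-byte memcmp can be expanded into two
// overlapping 8-byte loads per buffer (bytes [0,8) and [7,15)) instead of a
// libcall, subject to MaxNumLoads.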
4779
4781 return ST->hasSVE();
4782}
4783
4787 switch (MICA.getID()) {
4788 case Intrinsic::masked_scatter:
4789 case Intrinsic::masked_gather:
4790 return getGatherScatterOpCost(MICA, CostKind);
4791 case Intrinsic::masked_load:
4792 case Intrinsic::masked_store:
4793 return getMaskedMemoryOpCost(MICA, CostKind);
4794 }
4796}
4797
4801 Type *Src = MICA.getDataType();
4802
4803 if (useNeonVector(Src))
4805 auto LT = getTypeLegalizationCost(Src);
4806 if (!LT.first.isValid())
4807 return InstructionCost::getInvalid();
4808
4809 // Return an invalid cost for element types that we are unable to lower.
4810 auto *VT = cast<VectorType>(Src);
4811 if (VT->getElementType()->isIntegerTy(1))
4812 return InstructionCost::getInvalid();
4813
4814 // The code-generator is currently not able to handle scalable vectors
4815 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4816 // it. This change will be removed when code-generation for these types is
4817 // sufficiently reliable.
4818 if (VT->getElementCount() == ElementCount::getScalable(1))
4819 return InstructionCost::getInvalid();
4820
4821 return LT.first;
4822}
4823
4824// This function returns the gather/scatter overhead, either from the
4825// user-provided value or from the specialized per-target value in \p ST.
4826static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4827 const AArch64Subtarget *ST) {
4828 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4829 "Should only be called on loads or stores.");
4830 switch (Opcode) {
4831 case Instruction::Load:
4832 if (SVEGatherOverhead.getNumOccurrences() > 0)
4833 return SVEGatherOverhead;
4834 return ST->getGatherOverhead();
4835 break;
4836 case Instruction::Store:
4837 if (SVEScatterOverhead.getNumOccurrences() > 0)
4838 return SVEScatterOverhead;
4839 return ST->getScatterOverhead();
4840 break;
4841 default:
4842 llvm_unreachable("Shouldn't have reached here");
4843 }
4844}
4845
4849
4850 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4851 MICA.getID() == Intrinsic::vp_gather)
4852 ? Instruction::Load
4853 : Instruction::Store;
4854
4855 Type *DataTy = MICA.getDataType();
4856 Align Alignment = MICA.getAlignment();
4857 const Instruction *I = MICA.getInst();
4858
4859 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4860 return InstructionCost::getInvalid();
4861 auto *VT = cast<VectorType>(DataTy);
4862 auto LT = getTypeLegalizationCost(DataTy);
4863 if (!LT.first.isValid())
4864 return InstructionCost::getInvalid();
4865
4866 // Return an invalid cost for element types that we are unable to lower.
4867 if (!LT.second.isVector() ||
4868 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4869 VT->getElementType()->isIntegerTy(1))
4870 return InstructionCost::getInvalid();
4871
4872 // The code-generator is currently not able to handle scalable vectors
4873 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4874 // it. This change will be removed when code-generation for these types is
4875 // sufficiently reliable.
4876 if (VT->getElementCount() == ElementCount::getScalable(1))
4877 return InstructionCost::getInvalid();
4878
4879 ElementCount LegalVF = LT.second.getVectorElementCount();
4880 InstructionCost MemOpCost =
4881 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4882 {TTI::OK_AnyValue, TTI::OP_None}, I);
4883 // Add on an overhead cost for using gathers/scatters.
4884 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4885 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4886}
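// Illustrative cost shape (editorial note): for a masked gather of
// <vscale x 4 x i32> the result is roughly
//   LT.first * (scalar i32 load cost * gather overhead) * maxNumElements,
// where the gather overhead is either the -sve-gather-overhead value
// (default 10) or the subtarget's value, and maxNumElements depends on the
// maximum vscale assumed for the target.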
4887
4889 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4890}
4891
4893 Align Alignment,
4894 unsigned AddressSpace,
4896 TTI::OperandValueInfo OpInfo,
4897 const Instruction *I) const {
4898 EVT VT = TLI->getValueType(DL, Ty, true);
4899 // Type legalization can't handle structs
4900 if (VT == MVT::Other)
4901 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4902 CostKind);
4903
4904 auto LT = getTypeLegalizationCost(Ty);
4905 if (!LT.first.isValid())
4906 return InstructionCost::getInvalid();
4907
4908 // The code-generator is currently not able to handle scalable vectors
4909 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4910 // it. This change will be removed when code-generation for these types is
4911 // sufficiently reliable.
4912 // We also only support full register predicate loads and stores.
4913 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4914 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4915 (VTy->getElementType()->isIntegerTy(1) &&
4916 !VTy->getElementCount().isKnownMultipleOf(
4919
4920 // TODO: consider latency as well for TCK_SizeAndLatency.
4922 return LT.first;
4923
4925 return 1;
4926
4927 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4928 LT.second.is128BitVector() && Alignment < Align(16)) {
4929 // Unaligned stores are extremely inefficient. We don't split all
4930 // unaligned 128-bit stores because of the negative impact that doing so
4931 // has shown in practice on inlined block copy code.
4932 // We make such stores expensive so that we will only vectorize if there
4933 // are 6 other instructions getting vectorized.
4934 const int AmortizationCost = 6;
4935
4936 return LT.first * 2 * AmortizationCost;
4937 }
4938
4939 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4940 if (Ty->isPtrOrPtrVectorTy())
4941 return LT.first;
4942
4943 if (useNeonVector(Ty)) {
4944 // Check truncating stores and extending loads.
4945 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4946 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4947 if (VT == MVT::v4i8)
4948 return 2;
4949 // Otherwise we need to scalarize.
4950 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4951 }
4952 EVT EltVT = VT.getVectorElementType();
4953 unsigned EltSize = EltVT.getScalarSizeInBits();
4954 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4955 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4956 return LT.first;
4957 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4958 // widening to v4i8, which produces suboptimal results.
4959 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4960 return LT.first;
4961
4962 // Check non-power-of-2 loads/stores for legal vector element types with
4963 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4964 // operations on smaller power-of-2 ops, including ld1/st1.
4965 LLVMContext &C = Ty->getContext();
4967 SmallVector<EVT> TypeWorklist;
4968 TypeWorklist.push_back(VT);
4969 while (!TypeWorklist.empty()) {
4970 EVT CurrVT = TypeWorklist.pop_back_val();
4971 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4972 if (isPowerOf2_32(CurrNumElements)) {
4973 Cost += 1;
4974 continue;
4975 }
4976
4977 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4978 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4979 TypeWorklist.push_back(
4980 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4981 }
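      // Worked example (illustrative, editorial note): a <7 x i8> access with
      // align 1 is decomposed by the worklist above into 4 + 2 + 1 elements,
      // i.e. three power-of-2 pieces, giving Cost = 3.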
4982 return Cost;
4983 }
4984
4985 return LT.first;
4986}
4987
4989 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4990 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4991 bool UseMaskForCond, bool UseMaskForGaps) const {
4992 assert(Factor >= 2 && "Invalid interleave factor");
4993 auto *VecVTy = cast<VectorType>(VecTy);
4994
4995 if (VecTy->isScalableTy() && !ST->hasSVE())
4996 return InstructionCost::getInvalid();
4997
4998 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4999 // only have lowering for power-of-2 factors.
5000 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5001 // InterleavedAccessPass for ld3/st3
5002 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5003 return InstructionCost::getInvalid();
5004
5005 // Vectorization for masked interleaved accesses is only enabled for scalable
5006 // VF.
5007 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5008 return InstructionCost::getInvalid();
5009
5010 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5011 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5012 auto *SubVecTy =
5013 VectorType::get(VecVTy->getElementType(),
5014 VecVTy->getElementCount().divideCoefficientBy(Factor));
5015
5016 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5017 // Accesses having vector types that are a multiple of 128 bits can be
5018 // matched to more than one ldN/stN instruction.
5019 bool UseScalable;
5020 if (MinElts % Factor == 0 &&
5021 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5022 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5023 }
5024
5025 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5026 Alignment, AddressSpace, CostKind,
5027 UseMaskForCond, UseMaskForGaps);
5028}
5029
5034 for (auto *I : Tys) {
5035 if (!I->isVectorTy())
5036 continue;
5037 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5038 128)
5039 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5040 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5041 }
5042 return Cost;
5043}
5044
5046 return ST->getMaxInterleaveFactor();
5047}
5048
5049// For Falkor, we want to avoid having too many strided loads in a loop since
5050// that can exhaust the HW prefetcher resources. We adjust the unroller
5051// MaxCount preference below to attempt to ensure unrolling doesn't create too
5052// many strided loads.
5053static void
5054getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5055 TargetTransformInfo::UnrollingPreferences &UP) {
5056 enum { MaxStridedLoads = 7 };
5057 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5058 int StridedLoads = 0;
5059 // FIXME? We could make this more precise by looking at the CFG and
5060 // e.g. not counting loads in each side of an if-then-else diamond.
5061 for (const auto BB : L->blocks()) {
5062 for (auto &I : *BB) {
5063 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5064 if (!LMemI)
5065 continue;
5066
5067 Value *PtrValue = LMemI->getPointerOperand();
5068 if (L->isLoopInvariant(PtrValue))
5069 continue;
5070
5071 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5072 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5073 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5074 continue;
5075
5076 // FIXME? We could take pairing of unrolled load copies into account
5077 // by looking at the AddRec, but we would probably have to limit this
5078 // to loops with no stores or other memory optimization barriers.
5079 ++StridedLoads;
5080 // We've seen enough strided loads that seeing more won't make a
5081 // difference.
5082 if (StridedLoads > MaxStridedLoads / 2)
5083 return StridedLoads;
5084 }
5085 }
5086 return StridedLoads;
5087 };
5088
5089 int StridedLoads = countStridedLoads(L, SE);
5090 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5091 << " strided loads\n");
5092 // Pick the largest power of 2 unroll count that won't result in too many
5093 // strided loads.
5094 if (StridedLoads) {
5095 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5096 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5097 << UP.MaxCount << '\n');
5098 }
5099}
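// Worked example (illustrative, editorial note): with 3 strided loads
// detected, the code above sets UP.MaxCount = 1 << Log2_32(7 / 3) = 2, so
// unrolling creates at most 2 copies of the strided loads.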
5100
5101// This function returns true if the loop:
5102// 1. Has a valid cost, and
5103// 2. Has a cost within the supplied budget.
5104// Otherwise it returns false.
5106 InstructionCost Budget,
5107 unsigned *FinalSize) {
5108 // Estimate the size of the loop.
5109 InstructionCost LoopCost = 0;
5110
5111 for (auto *BB : L->getBlocks()) {
5112 for (auto &I : *BB) {
5113 SmallVector<const Value *, 4> Operands(I.operand_values());
5114 InstructionCost Cost =
5115 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5116 // This can happen with intrinsics that don't currently have a cost model
5117 // or for some operations that require SVE.
5118 if (!Cost.isValid())
5119 return false;
5120
5121 LoopCost += Cost;
5122 if (LoopCost > Budget)
5123 return false;
5124 }
5125 }
5126
5127 if (FinalSize)
5128 *FinalSize = LoopCost.getValue();
5129 return true;
5130}
5131
5133 const AArch64TTIImpl &TTI) {
5134 // Only consider loops with unknown trip counts for which we can determine
5135 // a symbolic expression. Multi-exit loops with small known trip counts will
5136 // likely be unrolled anyway.
5137 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5138 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5139 return false;
5140
5141 // It might not be worth unrolling loops with low max trip counts. Restrict
5142 // this to max trip counts > 32 for now.
5143 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5144 if (MaxTC > 0 && MaxTC <= 32)
5145 return false;
5146
5147 // Make sure the loop size is <= 5.
5148 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5149 return false;
5150
5151 // Small search loops with multiple exits can be highly beneficial to unroll.
5152 // We only care about loops with exactly two exiting blocks, although each
5153 // block could jump to the same exit block.
5154 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5155 if (Blocks.size() != 2)
5156 return false;
5157
5158 if (any_of(Blocks, [](BasicBlock *BB) {
5159 return !isa<BranchInst>(BB->getTerminator());
5160 }))
5161 return false;
5162
5163 return true;
5164}
5165
5166/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5167/// OOO engine's wide instruction window and various predictors.
5168static void
5171 const AArch64TTIImpl &TTI) {
5172 // Limit this to loops with structure that is highly likely to benefit from
5173 // runtime unrolling; that is, we exclude outer loops and loops with many
5174 // blocks (i.e. likely with complex control flow). Note that the heuristics
5175 // here may be overly conservative and we err on the side of avoiding runtime
5176 // unrolling rather than unrolling excessively; all are subject to refinement.
5177 if (!L->isInnermost() || L->getNumBlocks() > 8)
5178 return;
5179
5180 // Loops with multiple exits are handled by common code.
5181 if (!L->getExitBlock())
5182 return;
5183
5184 // Check if the loop contains any reductions that could be parallelized when
5185 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5186 // a multiple of 2.
5187 bool HasParallelizableReductions =
5188 L->getNumBlocks() == 1 &&
5189 any_of(L->getHeader()->phis(),
5190 [&SE, L](PHINode &Phi) {
5191 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5192 }) &&
5193 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5194 if (HasParallelizableReductions &&
5195 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5196 UP.Partial = true;
5197 UP.MaxCount = 4;
5198 UP.AddAdditionalAccumulators = true;
5199 }
5200
5201 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5202 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5203 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5204 SE.getSmallConstantMaxTripCount(L) <= 32))
5205 return;
5206
5207 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5208 return;
5209
5211 return;
5212
5213 // Limit to loops with trip counts that are cheap to expand.
5214 UP.SCEVExpansionBudget = 1;
5215
5216 if (HasParallelizableReductions) {
5217 UP.Runtime = true;
5219 UP.AddAdditionalAccumulators = true;
5220 }
5221
5222 // Try to unroll small loops of a few blocks with a low budget, if they have
5223 // load/store dependencies, to expose more parallel memory access streams,
5224 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5225 BasicBlock *Header = L->getHeader();
5226 BasicBlock *Latch = L->getLoopLatch();
5227 if (Header == Latch) {
5228 // Estimate the size of the loop.
5229 unsigned Size;
5230 unsigned Width = 10;
5231 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5232 return;
5233
5234 // Try to find an unroll count that maximizes the use of the instruction
5235 // window, i.e. trying to fetch as many instructions per cycle as possible.
5236 unsigned MaxInstsPerLine = 16;
5237 unsigned UC = 1;
5238 unsigned BestUC = 1;
5239 unsigned SizeWithBestUC = BestUC * Size;
5240 while (UC <= 8) {
5241 unsigned SizeWithUC = UC * Size;
5242 if (SizeWithUC > 48)
5243 break;
5244 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5245 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5246 BestUC = UC;
5247 SizeWithBestUC = BestUC * Size;
5248 }
5249 UC++;
5250 }
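    // Worked example (illustrative, editorial note): for a single-block loop
    // of Size = 12 instructions, the search above picks BestUC = 4, because
    // 4 * 12 = 48 stays within the 48-instruction cap and is a multiple of
    // MaxInstsPerLine (16), while smaller counts are not.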
5251
5252 if (BestUC == 1)
5253 return;
5254
5255 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5256 SmallVector<StoreInst *> Stores;
5257 for (auto *BB : L->blocks()) {
5258 for (auto &I : *BB) {
5259 Value *Ptr = getLoadStorePointerOperand(&I);
5260 if (!Ptr)
5261 continue;
5262 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5263 if (SE.isLoopInvariant(PtrSCEV, L))
5264 continue;
5265 if (isa<LoadInst>(&I)) {
5266 LoadedValuesPlus.insert(&I);
5267 // Include in-loop 1st users of loaded values.
5268 for (auto *U : I.users())
5269 if (L->contains(cast<Instruction>(U)))
5270 LoadedValuesPlus.insert(U);
5271 } else
5272 Stores.push_back(cast<StoreInst>(&I));
5273 }
5274 }
5275
5276 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5277 return LoadedValuesPlus.contains(SI->getOperand(0));
5278 }))
5279 return;
5280
5281 UP.Runtime = true;
5282 UP.DefaultUnrollRuntimeCount = BestUC;
5283 return;
5284 }
5285
5286 // Try to runtime-unroll loops with early-continues depending on loop-varying
5287 // loads; this helps with branch-prediction for the early-continues.
5288 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5290 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5291 !llvm::is_contained(Preds, Header) ||
5292 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5293 return;
5294
5295 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5296 [&](Instruction *I, unsigned Depth) -> bool {
5297 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5298 return false;
5299
5300 if (isa<LoadInst>(I))
5301 return true;
5302
5303 return any_of(I->operands(), [&](Value *V) {
5304 auto *I = dyn_cast<Instruction>(V);
5305 return I && DependsOnLoopLoad(I, Depth + 1);
5306 });
5307 };
5308 CmpPredicate Pred;
5309 Instruction *I;
5310 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5311 m_Value())) &&
5312 DependsOnLoopLoad(I, 0)) {
5313 UP.Runtime = true;
5314 }
5315}
5316
5319 OptimizationRemarkEmitter *ORE) const {
5320 // Enable partial unrolling and runtime unrolling.
5321 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5322
5323 UP.UpperBound = true;
5324
5325 // An inner loop is more likely to be a hot one, and the runtime check
5326 // can be promoted out by the LICM pass, so the overhead is lower; let's try
5327 // a larger threshold to unroll more loops.
5328 if (L->getLoopDepth() > 1)
5329 UP.PartialThreshold *= 2;
5330
5331 // Disable partial & runtime unrolling on -Os.
5332 UP.PartialOptSizeThreshold = 0;
5333
5334 // Scan the loop: don't unroll loops with calls as this could prevent
5335 // inlining. Don't unroll auto-vectorized loops either, though do allow
5336 // unrolling of the scalar remainder.
5337 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5338 InstructionCost Cost = 0;
5339 for (auto *BB : L->getBlocks()) {
5340 for (auto &I : *BB) {
5341 // Both auto-vectorized loops and the scalar remainder have the
5342 // isvectorized attribute, so differentiate between them by the presence
5343 // of vector instructions.
5344 if (IsVectorized && I.getType()->isVectorTy())
5345 return;
5346 if (isa<CallBase>(I)) {
5349 if (!isLoweredToCall(F))
5350 continue;
5351 return;
5352 }
5353
5354 SmallVector<const Value *, 4> Operands(I.operand_values());
5355 Cost += getInstructionCost(&I, Operands,
5357 }
5358 }
5359
5360 // Apply subtarget-specific unrolling preferences.
5361 if (ST->isAppleMLike())
5362 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5363 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5364 EnableFalkorHWPFUnrollFix)
5365 getFalkorUnrollingPreferences(L, SE, UP);
5366
5367 // If this is a small, multi-exit loop similar to something like std::find,
5368 // then there is typically a performance improvement achieved by unrolling.
5369 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5370 UP.RuntimeUnrollMultiExit = true;
5371 UP.Runtime = true;
5372 // Limit unroll count.
5374 // Allow slightly more costly trip-count expansion to catch search loops
5375 // with pointer inductions.
5376 UP.SCEVExpansionBudget = 5;
5377 return;
5378 }
5379
5380 // Enable runtime unrolling for in-order models.
5381 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5382 // by checking for that case, we can ensure that the default behaviour is
5383 // unchanged.
5384 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5385 !ST->getSchedModel().isOutOfOrder()) {
5386 UP.Runtime = true;
5387 UP.Partial = true;
5388 UP.UnrollRemainder = true;
5390
5391 UP.UnrollAndJam = true;
5393 }
5394
5395 // Forcing unrolling of small loops can be very useful because of the
5396 // branch-taken cost of the backedge.
5398 UP.Force = true;
5399}
5400
5405
5406Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5407 Type *ExpectedType,
5408 bool CanCreate) const {
5409 switch (Inst->getIntrinsicID()) {
5410 default:
5411 return nullptr;
5412 case Intrinsic::aarch64_neon_st2:
5413 case Intrinsic::aarch64_neon_st3:
5414 case Intrinsic::aarch64_neon_st4: {
5415 // Create a struct type
5416 StructType *ST = dyn_cast<StructType>(ExpectedType);
5417 if (!CanCreate || !ST)
5418 return nullptr;
5419 unsigned NumElts = Inst->arg_size() - 1;
5420 if (ST->getNumElements() != NumElts)
5421 return nullptr;
5422 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5423 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5424 return nullptr;
5425 }
5426 Value *Res = PoisonValue::get(ExpectedType);
5427 IRBuilder<> Builder(Inst);
5428 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5429 Value *L = Inst->getArgOperand(i);
5430 Res = Builder.CreateInsertValue(Res, L, i);
5431 }
5432 return Res;
5433 }
5434 case Intrinsic::aarch64_neon_ld2:
5435 case Intrinsic::aarch64_neon_ld3:
5436 case Intrinsic::aarch64_neon_ld4:
5437 if (Inst->getType() == ExpectedType)
5438 return Inst;
5439 return nullptr;
5440 }
5441}
5442
5443bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5444 MemIntrinsicInfo &Info) const {
5445 switch (Inst->getIntrinsicID()) {
5446 default:
5447 break;
5448 case Intrinsic::aarch64_neon_ld2:
5449 case Intrinsic::aarch64_neon_ld3:
5450 case Intrinsic::aarch64_neon_ld4:
5451 Info.ReadMem = true;
5452 Info.WriteMem = false;
5453 Info.PtrVal = Inst->getArgOperand(0);
5454 break;
5455 case Intrinsic::aarch64_neon_st2:
5456 case Intrinsic::aarch64_neon_st3:
5457 case Intrinsic::aarch64_neon_st4:
5458 Info.ReadMem = false;
5459 Info.WriteMem = true;
5460 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5461 break;
5462 }
5463
5464 switch (Inst->getIntrinsicID()) {
5465 default:
5466 return false;
5467 case Intrinsic::aarch64_neon_ld2:
5468 case Intrinsic::aarch64_neon_st2:
5469 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5470 break;
5471 case Intrinsic::aarch64_neon_ld3:
5472 case Intrinsic::aarch64_neon_st3:
5473 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5474 break;
5475 case Intrinsic::aarch64_neon_ld4:
5476 case Intrinsic::aarch64_neon_st4:
5477 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5478 break;
5479 }
5480 return true;
5481}
5482
5483/// See if \p I should be considered for address type promotion. We check if \p
5484/// I is a sext with the right type and used in memory accesses. If it is used
5485/// in a "complex" getelementptr, we allow it to be promoted without finding other
5486/// sext instructions that sign extended the same initial value. A getelementptr
5487/// is considered as "complex" if it has more than 2 operands.
5488bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5489 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5490 bool Considerable = false;
5491 AllowPromotionWithoutCommonHeader = false;
5492 if (!isa<SExtInst>(&I))
5493 return false;
5494 Type *ConsideredSExtType =
5495 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5496 if (I.getType() != ConsideredSExtType)
5497 return false;
5498 // See if the sext is the one with the right type and used in at least one
5499 // GetElementPtrInst.
5500 for (const User *U : I.users()) {
5501 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5502 Considerable = true;
5503 // A getelementptr is considered as "complex" if it has more than 2
5504 // operands. We will promote a SExt used in such complex GEP as we
5505 // expect some computation to be merged if they are done on 64 bits.
5506 if (GEPInst->getNumOperands() > 2) {
5507 AllowPromotionWithoutCommonHeader = true;
5508 break;
5509 }
5510 }
5511 }
5512 return Considerable;
5513}
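// Example (illustrative, editorial note): in IR such as
//   %idx = sext i32 %i to i64
//   %p   = getelementptr inbounds [64 x i32], ptr %base, i64 0, i64 %idx
// the sext produces an i64 feeding a GEP with more than 2 operands, so it is
// considered for promotion with AllowPromotionWithoutCommonHeader set.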
5514
5515bool AArch64TTIImpl::isLegalToVectorizeReduction(
5516 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5517 if (!VF.isScalable())
5518 return true;
5519
5520 Type *Ty = RdxDesc.getRecurrenceType();
5521 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5522 return false;
5523
5524 switch (RdxDesc.getRecurrenceKind()) {
5525 case RecurKind::Sub:
5527 case RecurKind::Add:
5528 case RecurKind::FAdd:
5529 case RecurKind::And:
5530 case RecurKind::Or:
5531 case RecurKind::Xor:
5532 case RecurKind::SMin:
5533 case RecurKind::SMax:
5534 case RecurKind::UMin:
5535 case RecurKind::UMax:
5536 case RecurKind::FMin:
5537 case RecurKind::FMax:
5538 case RecurKind::FMulAdd:
5539 case RecurKind::AnyOf:
5541 return true;
5542 default:
5543 return false;
5544 }
5545}
5546
5547InstructionCost
5548AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5549 FastMathFlags FMF,
5550 TTI::TargetCostKind CostKind) const {
5551 // The code-generator is currently not able to handle scalable vectors
5552 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5553 // it. This change will be removed when code-generation for these types is
5554 // sufficiently reliable.
5555 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5556 if (VTy->getElementCount() == ElementCount::getScalable(1))
5557 return InstructionCost::getInvalid();
5558
5559 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5560
5561 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5562 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5563
5564 InstructionCost LegalizationCost = 0;
5565 if (LT.first > 1) {
5566 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5567 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5568 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5569 }
5570
5571 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5572}
5573
5575 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5576 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5577 InstructionCost LegalizationCost = 0;
5578 if (LT.first > 1) {
5579 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5580 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5581 LegalizationCost *= LT.first - 1;
5582 }
5583
5584 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5585 assert(ISD && "Invalid opcode");
5586 // Add the final reduction cost for the legal horizontal reduction
5587 switch (ISD) {
5588 case ISD::ADD:
5589 case ISD::AND:
5590 case ISD::OR:
5591 case ISD::XOR:
5592 case ISD::FADD:
5593 return LegalizationCost + 2;
5594 default:
5596 }
5597}
5598
5601 std::optional<FastMathFlags> FMF,
5603 // The code-generator is currently not able to handle scalable vectors
5604 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5605 // it. This change will be removed when code-generation for these types is
5606 // sufficiently reliable.
5607 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5608 if (VTy->getElementCount() == ElementCount::getScalable(1))
5609 return InstructionCost::getInvalid();
5610
5612 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5613 InstructionCost BaseCost =
5614 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5615 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5616 // end up vectorizing for more computationally intensive loops.
5617 return BaseCost + FixedVTy->getNumElements();
5618 }
5619
5620 if (Opcode != Instruction::FAdd)
5622
5623 auto *VTy = cast<ScalableVectorType>(ValTy);
5625 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5626 Cost *= getMaxNumElements(VTy->getElementCount());
5627 return Cost;
5628 }
5629
5630 if (isa<ScalableVectorType>(ValTy))
5631 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5632
5633 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5634 MVT MTy = LT.second;
5635 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5636 assert(ISD && "Invalid opcode");
5637
5638 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5639 // instructions as twice a normal vector add, plus 1 for each legalization
5640 // step (LT.first). This is the only arithmetic vector reduction operation for
5641 // which we have an instruction.
5642 // OR, XOR and AND costs should match the codegen from:
5643 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5644 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5645 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5646 static const CostTblEntry CostTblNoPairwise[]{
5647 {ISD::ADD, MVT::v8i8, 2},
5648 {ISD::ADD, MVT::v16i8, 2},
5649 {ISD::ADD, MVT::v4i16, 2},
5650 {ISD::ADD, MVT::v8i16, 2},
5651 {ISD::ADD, MVT::v2i32, 2},
5652 {ISD::ADD, MVT::v4i32, 2},
5653 {ISD::ADD, MVT::v2i64, 2},
5654 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5655 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5656 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5657 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5658 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5659 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5660 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5661 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5662 {ISD::XOR, MVT::v16i8, 7},
5663 {ISD::XOR, MVT::v4i16, 4},
5664 {ISD::XOR, MVT::v8i16, 6},
5665 {ISD::XOR, MVT::v2i32, 3},
5666 {ISD::XOR, MVT::v4i32, 5},
5667 {ISD::XOR, MVT::v2i64, 3},
5668 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5669 {ISD::AND, MVT::v16i8, 7},
5670 {ISD::AND, MVT::v4i16, 4},
5671 {ISD::AND, MVT::v8i16, 6},
5672 {ISD::AND, MVT::v2i32, 3},
5673 {ISD::AND, MVT::v4i32, 5},
5674 {ISD::AND, MVT::v2i64, 3},
5675 };
5676 switch (ISD) {
5677 default:
5678 break;
5679 case ISD::FADD:
5680 if (Type *EltTy = ValTy->getScalarType();
5681 // FIXME: For half types without fullfp16 support, this could extend and
5682 // use a fp32 faddp reduction but current codegen unrolls.
5683 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5684 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5685 const unsigned NElts = MTy.getVectorNumElements();
5686 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5687 isPowerOf2_32(NElts))
5688 // Reduction corresponding to series of fadd instructions is lowered to
5689 // series of faddp instructions. faddp has latency/throughput that
5690 // matches fadd instruction and hence, every faddp instruction can be
5691 // considered to have a relative cost = 1 with
5692 // CostKind = TCK_RecipThroughput.
5693 // An faddp will pairwise add vector elements, so the size of input
5694 // vector reduces by half every time, requiring
5695 // #(faddp instructions) = log2_32(NElts).
5696 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5697 }
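    // Worked example (illustrative, editorial note): an fadd reduction of
    // <8 x float> legalizes to two v4f32 halves (LT.first = 2, NElts = 4),
    // so the model above returns (2 - 1) + log2(4) = 3.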
5698 break;
5699 case ISD::ADD:
5700 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5701 return (LT.first - 1) + Entry->Cost;
5702 break;
5703 case ISD::XOR:
5704 case ISD::AND:
5705 case ISD::OR:
5706 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5707 if (!Entry)
5708 break;
5709 auto *ValVTy = cast<FixedVectorType>(ValTy);
5710 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5711 isPowerOf2_32(ValVTy->getNumElements())) {
5712 InstructionCost ExtraCost = 0;
5713 if (LT.first != 1) {
5714 // Type needs to be split, so there is an extra cost of LT.first - 1
5715 // arithmetic ops.
5716 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5717 MTy.getVectorNumElements());
5718 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5719 ExtraCost *= LT.first - 1;
5720 }
5721 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5722 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5723 return Cost + ExtraCost;
5724 }
5725 break;
5726 }
5727 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5728}
5729
5731 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5732 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5733 EVT VecVT = TLI->getValueType(DL, VecTy);
5734 EVT ResVT = TLI->getValueType(DL, ResTy);
5735
5736 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5737 VecVT.getSizeInBits() >= 64) {
5738 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5739
5740 // The legal cases are:
5741 // UADDLV 8/16/32->32
5742 // UADDLP 32->64
5743 unsigned RevVTSize = ResVT.getSizeInBits();
5744 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5745 RevVTSize <= 32) ||
5746 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5747 RevVTSize <= 32) ||
5748 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5749 RevVTSize <= 64))
5750 return (LT.first - 1) * 2 + 2;
5751 }
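  // Worked example (illustrative, editorial note): a zero-extending add
  // reduction from <16 x i8> to i32 legalizes to v16i8 (LT.first = 1) with
  // RevVTSize = 32, so the model above returns (1 - 1) * 2 + 2 = 2, roughly
  // a single uaddlv.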
5752
5753 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5754 CostKind);
5755}
5756
5758AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5759 Type *ResTy, VectorType *VecTy,
5761 EVT VecVT = TLI->getValueType(DL, VecTy);
5762 EVT ResVT = TLI->getValueType(DL, ResTy);
5763
5764 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5765 RedOpcode == Instruction::Add) {
5766 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5767
5768 // The legal cases with dotprod are
5769 // UDOT 8->32
5770 // Which requires an additional uaddv to sum the i32 values.
5771 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5772 ResVT == MVT::i32)
5773 return LT.first + 2;
5774 }
5775
5776 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5777 CostKind);
5778}
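// Worked example (illustrative, editorial note): on a subtarget with
// +dotprod, a multiply-accumulate reduction of <16 x i8> inputs into an i32
// result legalizes to v16i8 (LT.first = 1) and is costed as 1 + 2 = 3,
// modelling a udot/sdot plus the final uaddv.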
5779
5783 static const CostTblEntry ShuffleTbl[] = {
5784 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5785 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5786 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5787 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5788 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5789 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5790 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5791 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5792 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5793 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5794 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5795 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5796 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5797 };
5798
5799 // The code-generator is currently not able to handle scalable vectors
5800 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5801 // it. This change will be removed when code-generation for these types is
5802 // sufficiently reliable.
5805
5806 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5807 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5808 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5809 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5810 : LT.second;
5811 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5812 InstructionCost LegalizationCost = 0;
5813 if (Index < 0) {
5814 LegalizationCost =
5815 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5817 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5819 }
5820
5821 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
5822 // The cost is computed on the promoted type.
5823 if (LT.second.getScalarType() == MVT::i1) {
5824 LegalizationCost +=
5825 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5827 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5829 }
5830 const auto *Entry =
5831 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5832 assert(Entry && "Illegal Type for Splice");
5833 LegalizationCost += Entry->Cost;
5834 return LegalizationCost * LT.first;
5835}
5836
5838 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5840 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5843
5845 return Invalid;
5846
5847 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5848 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5849 return Invalid;
5850
5851 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5852 OpAExtend == TTI::PR_None)
5853 return Invalid;
5854
5855 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5856 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5857 "Unexpected values for OpBExtend or InputTypeB");
5858
5859 // We only support multiply binary operations for now, and for muls we
5860 // require the types being extended to be the same.
5861 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5862 return Invalid;
5863
5864 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5865 if (IsUSDot && !ST->hasMatMulInt8())
5866 return Invalid;
5867
5868 unsigned Ratio =
5869 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5870 if (VF.getKnownMinValue() <= Ratio)
5871 return Invalid;
5872
5873 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5874 VectorType *AccumVectorType =
5875 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5876 // We don't yet support all kinds of legalization.
5877 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5878 EVT::getEVT(AccumVectorType));
5879 switch (TC.first) {
5880 default:
5881 return Invalid;
5885 // The legalised type (e.g. after splitting) must be legal too.
5886 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5888 return Invalid;
5889 break;
5890 }
5891
5892 std::pair<InstructionCost, MVT> AccumLT =
5893 getTypeLegalizationCost(AccumVectorType);
5894 std::pair<InstructionCost, MVT> InputLT =
5895 getTypeLegalizationCost(InputVectorType);
5896
5897 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5898
5899 // Prefer using full types by costing half-full input types as more expensive.
5900 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5902 // FIXME: This can be removed after the cost of the extends are folded into
5903 // the dot-product expression in VPlan, after landing:
5904 // https://github.com/llvm/llvm-project/pull/147302
5905 Cost *= 2;
5906
5907 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5908 // i16 -> i64 is natively supported for udot/sdot
5909 if (AccumLT.second.getScalarType() == MVT::i64 &&
5910 InputLT.second.getScalarType() == MVT::i16)
5911 return Cost;
5912 // i16 -> i32 is natively supported with SVE2p1
5913 if (AccumLT.second.getScalarType() == MVT::i32 &&
5914 InputLT.second.getScalarType() == MVT::i16 &&
5915 (ST->hasSVE2p1() || ST->hasSME2()))
5916 return Cost;
5917 // i8 -> i64 is supported with an extra level of extends
5918 if (AccumLT.second.getScalarType() == MVT::i64 &&
5919 InputLT.second.getScalarType() == MVT::i8)
5920 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5921 // because it requires two extra extends on the inputs. But if we'd change
5922 // that now, a regular reduction would be cheaper because the costs of
5923 // the extends in the IR are still counted. This can be fixed
5924 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5925 return Cost;
5926 }
5927
5928 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5929 if (ST->isSVEorStreamingSVEAvailable() ||
5930 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5931 ST->hasDotProd())) {
5932 if (AccumLT.second.getScalarType() == MVT::i32 &&
5933 InputLT.second.getScalarType() == MVT::i8)
5934 return Cost;
5935 }
5936
5937 // Add additional cost for the extends that would need to be inserted.
5938 return Cost + 2;
5939}
5940
5943 VectorType *SrcTy, ArrayRef<int> Mask,
5944 TTI::TargetCostKind CostKind, int Index,
5946 const Instruction *CxtI) const {
5947 assert((Mask.empty() || DstTy->isScalableTy() ||
5948 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5949 "Expected the Mask to match the return size if given");
5950 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5951 "Expected the same scalar types");
5952 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5953
5954 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5955 // into smaller vectors and sum the cost of each shuffle.
5956 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5957 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5958 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5959 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5960 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5961 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5962 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5963 // cost than just the load.
5964 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5967 return std::max<InstructionCost>(1, LT.first / 4);
5968
5969 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5970 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5971 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5972 // cost than just the store.
5973 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5975 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5977 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5978 return LT.first;
5979
5980 unsigned TpNumElts = Mask.size();
5981 unsigned LTNumElts = LT.second.getVectorNumElements();
5982 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5983 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5984 LT.second.getVectorElementCount());
5986 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5987 PreviousCosts;
5988 for (unsigned N = 0; N < NumVecs; N++) {
5989 SmallVector<int> NMask;
5990 // Split the existing mask into chunks of size LTNumElts. Track the source
5991 // sub-vectors to ensure the result has at most 2 inputs.
5992 unsigned Source1 = -1U, Source2 = -1U;
5993 unsigned NumSources = 0;
5994 for (unsigned E = 0; E < LTNumElts; E++) {
5995 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5996 : PoisonMaskElem;
5997 if (MaskElt < 0) {
5998 NMask.push_back(PoisonMaskElem);
5999 continue;
6000 }
6001
6002 // Calculate which source from the input this comes from and whether it
6003 // is new to us.
6004 unsigned Source = MaskElt / LTNumElts;
6005 if (NumSources == 0) {
6006 Source1 = Source;
6007 NumSources = 1;
6008 } else if (NumSources == 1 && Source != Source1) {
6009 Source2 = Source;
6010 NumSources = 2;
6011 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6012 NumSources++;
6013 }
6014
6015 // Add to the new mask. For the NumSources>2 case these are not correct,
6016 // but are only used for the modular lane number.
6017 if (Source == Source1)
6018 NMask.push_back(MaskElt % LTNumElts);
6019 else if (Source == Source2)
6020 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6021 else
6022 NMask.push_back(MaskElt % LTNumElts);
6023 }
6024 // Check if we have already generated this sub-shuffle, which means we
6025 // will have already generated the output. For example a <16 x i32> splat
6026 // will be the same sub-splat 4 times, which only needs to be generated
6027 // once and reused.
6028 auto Result =
6029 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6030 // Check if it was already in the map (already costed).
6031 if (!Result.second)
6032 continue;
6033 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6034 // getShuffleCost. If not then cost it using the worst case as the number
6035 // of element moves into a new vector.
6036 InstructionCost NCost =
6037 NumSources <= 2
6038 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6040 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6041 CxtI)
6042 : LTNumElts;
6043 Result.first->second = NCost;
6044 Cost += NCost;
6045 }
6046 return Cost;
6047 }
6048
6049 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6050 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6051 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6052 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6053 // This currently only handles low or high extracts to prevent SLP vectorizer
6054 // regressions.
6055 // Note that SVE's ext instruction is destructive, but it can be fused with
6056 // a movprfx to act like a constructive instruction.
6057 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6058 if (LT.second.getFixedSizeInBits() >= 128 &&
6059 cast<FixedVectorType>(SubTp)->getNumElements() ==
6060 LT.second.getVectorNumElements() / 2) {
6061 if (Index == 0)
6062 return 0;
6063 if (Index == (int)LT.second.getVectorNumElements() / 2)
6064 return 1;
6065 }
6067 }
6068 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6069 // the code to handle length-changing shuffles.
6070 if (Kind == TTI::SK_InsertSubvector) {
6071 LT = getTypeLegalizationCost(DstTy);
6072 SrcTy = DstTy;
6073 }
6074
6075 // Check for identity masks, which we can treat as free for both fixed and
6076 // scalable vector paths.
6077 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6078 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6079 all_of(enumerate(Mask), [](const auto &M) {
6080 return M.value() < 0 || M.value() == (int)M.index();
6081 }))
6082 return 0;
6083
6084 // Segmented shuffle matching.
6085 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6086 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6087 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6089
6091 unsigned Segments =
6093 unsigned SegmentElts = VTy->getNumElements() / Segments;
6094
6095 // dupq zd.t, zn.t[idx]
6096 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6097 ST->isSVEorStreamingSVEAvailable() &&
6098 isDUPQMask(Mask, Segments, SegmentElts))
6099 return LT.first;
6100
6101 // mov zd.q, vn
6102 if (ST->isSVEorStreamingSVEAvailable() &&
6103 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6104 return LT.first;
6105 }
6106
6107 // Check for broadcast loads, which are supported by the LD1R instruction.
6108 // In terms of code-size, the shuffle vector is free when a load + dup get
6109 // folded into a LD1R. That's what we check and return here. For performance
6110 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6111 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6112 // that we model the load + dup sequence slightly higher because LD1R is a
6113 // high latency instruction.
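// For example, for code size a broadcast such as
//   %v = load i32, ptr %p
//   %i = insertelement <4 x i32> poison, i32 %v, i64 0
//   %s = shufflevector <4 x i32> %i, <4 x i32> poison, <4 x i32> zeroinitializer
// folds into a single "ld1r { v0.4s }, [x0]", so the shuffle itself is costed
// as 0 here.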
6114 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6115 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6116 if (IsLoad && LT.second.isVector() &&
6117 isLegalBroadcastLoad(SrcTy->getElementType(),
6118 LT.second.getVectorElementCount()))
6119 return 0;
6120 }
6121
6122 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6123 // from the perfect shuffle tables.
6124 if (Mask.size() == 4 &&
6125 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6126 (SrcTy->getScalarSizeInBits() == 16 ||
6127 SrcTy->getScalarSizeInBits() == 32) &&
6128 all_of(Mask, [](int E) { return E < 8; }))
6129 return getPerfectShuffleCost(Mask);
6130
6131 // Check for other shuffles that are not SK_ kinds but we have native
6132 // instructions for, for example ZIP and UZP.
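// For example, on a <4 x i32> shuffle the two-source mask <0,4,1,5> matches
// isZIPMask (a single zip1) and <0,2,4,6> matches isUZPMask (a single uzp1).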
6133 unsigned Unused;
6134 if (LT.second.isFixedLengthVector() &&
6135 LT.second.getVectorNumElements() == Mask.size() &&
6136 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6137 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6138 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6139 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6140 Kind == TTI::SK_InsertSubvector) &&
6141 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6142 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6143 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6144 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6145 LT.second.getVectorNumElements(), 16) ||
6146 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6147 LT.second.getVectorNumElements(), 32) ||
6148 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6149 LT.second.getVectorNumElements(), 64) ||
6150 // Check for non-zero lane splats
6151 all_of(drop_begin(Mask),
6152 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6153 return 1;
6154
6155 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6156 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6157 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6158 static const CostTblEntry ShuffleTbl[] = {
6159 // Broadcast shuffle kinds can be performed with 'dup'.
6160 {TTI::SK_Broadcast, MVT::v8i8, 1},
6161 {TTI::SK_Broadcast, MVT::v16i8, 1},
6162 {TTI::SK_Broadcast, MVT::v4i16, 1},
6163 {TTI::SK_Broadcast, MVT::v8i16, 1},
6164 {TTI::SK_Broadcast, MVT::v2i32, 1},
6165 {TTI::SK_Broadcast, MVT::v4i32, 1},
6166 {TTI::SK_Broadcast, MVT::v2i64, 1},
6167 {TTI::SK_Broadcast, MVT::v4f16, 1},
6168 {TTI::SK_Broadcast, MVT::v8f16, 1},
6169 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6170 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6171 {TTI::SK_Broadcast, MVT::v2f32, 1},
6172 {TTI::SK_Broadcast, MVT::v4f32, 1},
6173 {TTI::SK_Broadcast, MVT::v2f64, 1},
6174 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6175 // 'zip1/zip2' instructions.
6176 {TTI::SK_Transpose, MVT::v8i8, 1},
6177 {TTI::SK_Transpose, MVT::v16i8, 1},
6178 {TTI::SK_Transpose, MVT::v4i16, 1},
6179 {TTI::SK_Transpose, MVT::v8i16, 1},
6180 {TTI::SK_Transpose, MVT::v2i32, 1},
6181 {TTI::SK_Transpose, MVT::v4i32, 1},
6182 {TTI::SK_Transpose, MVT::v2i64, 1},
6183 {TTI::SK_Transpose, MVT::v4f16, 1},
6184 {TTI::SK_Transpose, MVT::v8f16, 1},
6185 {TTI::SK_Transpose, MVT::v4bf16, 1},
6186 {TTI::SK_Transpose, MVT::v8bf16, 1},
6187 {TTI::SK_Transpose, MVT::v2f32, 1},
6188 {TTI::SK_Transpose, MVT::v4f32, 1},
6189 {TTI::SK_Transpose, MVT::v2f64, 1},
6190 // Select shuffle kinds.
6191 // TODO: handle vXi8/vXi16.
6192 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6193 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6194 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6195 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6196 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6197 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6198 // PermuteSingleSrc shuffle kinds.
6199 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6200 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6201 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6202 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6203 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6204 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6205 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6206 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6207 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6208 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6209 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6210 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6211 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6212 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6213 // Reverse can be lowered with `rev`.
6214 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6215 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6216 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6217 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6218 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6219 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6220 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6221 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6222 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6223 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6224 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6225 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6226 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6227 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6228 // Splice can all be lowered as `ext`.
6229 {TTI::SK_Splice, MVT::v2i32, 1},
6230 {TTI::SK_Splice, MVT::v4i32, 1},
6231 {TTI::SK_Splice, MVT::v2i64, 1},
6232 {TTI::SK_Splice, MVT::v2f32, 1},
6233 {TTI::SK_Splice, MVT::v4f32, 1},
6234 {TTI::SK_Splice, MVT::v2f64, 1},
6235 {TTI::SK_Splice, MVT::v8f16, 1},
6236 {TTI::SK_Splice, MVT::v8bf16, 1},
6237 {TTI::SK_Splice, MVT::v8i16, 1},
6238 {TTI::SK_Splice, MVT::v16i8, 1},
6239 {TTI::SK_Splice, MVT::v4f16, 1},
6240 {TTI::SK_Splice, MVT::v4bf16, 1},
6241 {TTI::SK_Splice, MVT::v4i16, 1},
6242 {TTI::SK_Splice, MVT::v8i8, 1},
6243 // Broadcast shuffle kinds for scalable vectors
6244 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6245 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6246 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6247 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6248 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6249 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6250 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6251 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6252 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6253 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6254 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6255 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6256 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6257 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6258 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6259 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6260 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6261 // Handle the cases for vector.reverse with scalable vectors
6262 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6263 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6264 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6265 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6266 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6267 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6268 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6269 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6270 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6271 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6272 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6273 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6274 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6275 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6276 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6277 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6278 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6279 };
6280 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6281 return LT.first * Entry->Cost;
6282 }
6283
6284 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6285 return getSpliceCost(SrcTy, Index, CostKind);
6286
6287 // Inserting a subvector can often be done with either a D, S or H register
6288 // move, so long as the inserted vector is "aligned".
6289 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6290 LT.second.getSizeInBits() <= 128 && SubTp) {
6291 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6292 if (SubLT.second.isVector()) {
6293 int NumElts = LT.second.getVectorNumElements();
6294 int NumSubElts = SubLT.second.getVectorNumElements();
6295 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6296 return SubLT.first;
6297 }
6298 }
6299
6300 // Restore optimal kind.
6301 if (IsExtractSubvector)
6302 Kind = TTI::SK_ExtractSubvector;
6303 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6304 Args, CxtI);
6305}
6306
6307 static bool containsDecreasingPointers(Loop *TheLoop,
6308 PredicatedScalarEvolution *PSE,
6309 const DominatorTree &DT) {
6310 const auto &Strides = DenseMap<Value *, const SCEV *>();
6311 for (BasicBlock *BB : TheLoop->blocks()) {
6312 // Scan the instructions in the block and look for addresses that are
6313 // consecutive and decreasing.
6314 for (Instruction &I : *BB) {
6315 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6316 Value *Ptr = getLoadStorePointerOperand(&I);
6317 Type *AccessTy = getLoadStoreType(&I);
6318 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6319 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6320 .value_or(0) < 0)
6321 return true;
6322 }
6323 }
6324 }
6325 return false;
6326}
6327
6328 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6329 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6330 return SVEPreferFixedOverScalableIfEqualCost;
6331 // For cases like post-LTO vectorization, when we eventually know the trip
6332 // count, epilogue with fixed-width vectorization can be deleted if the trip
6333 // count is less than the epilogue iterations. That's why we prefer
6334 // fixed-width vectorization in epilogue in case of equal costs.
6335 if (IsEpilogue)
6336 return true;
6337 return ST->useFixedOverScalableIfEqualCost();
6338}
6339
6340 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6341 return ST->getEpilogueVectorizationMinVF();
6342}
6343
6344 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6345 if (!ST->hasSVE())
6346 return false;
6347
6348 // We don't currently support vectorisation with interleaving for SVE - with
6349 // such loops we're better off not using tail-folding. This gives us a chance
6350 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6351 if (TFI->IAI->hasGroups())
6352 return false;
6353
6354 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6355 if (TFI->LVL->getReductionVars().size())
6356 Required |= TailFoldingOpts::Reductions;
6357 if (TFI->LVL->getFixedOrderRecurrences().size())
6358 Required |= TailFoldingOpts::Recurrences;
6359
6360 // We call this to discover whether any load/store pointers in the loop have
6361 // negative strides. This will require extra work to reverse the loop
6362 // predicate, which may be expensive.
6363 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6364 TFI->LVL->getPredicatedScalarEvolution(),
6365 *TFI->LVL->getDominatorTree()))
6366 Required |= TailFoldingOpts::Reverse;
6367 if (Required == TailFoldingOpts::Disabled)
6368 Required |= TailFoldingOpts::Simple;
6369
6370 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6371 Required))
6372 return false;
6373
6374 // Don't tail-fold for tight loops where we would be better off interleaving
6375 // with an unpredicated loop.
6376 unsigned NumInsns = 0;
6377 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6378 NumInsns += BB->sizeWithoutDebug();
6379 }
6380
6381 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6382 return NumInsns >= SVETailFoldInsnThreshold;
6383}
6384
6385 InstructionCost
6386 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6387 StackOffset BaseOffset, bool HasBaseReg,
6388 int64_t Scale, unsigned AddrSpace) const {
6389 // Scaling factors are not free at all.
6390 // Operands | Rt Latency
6391 // -------------------------------------------
6392 // Rt, [Xn, Xm] | 4
6393 // -------------------------------------------
6394 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6395 // Rt, [Xn, Wm, <extend> #imm] |
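// For example, "ldr x0, [x1, x2, lsl #3]" uses Scale == 8 and is reported as
// having a cost, whereas the unscaled "ldr x0, [x1, x2]" (Scale == 1) is
// treated as free.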
6396 TargetLoweringBase::AddrMode AM;
6397 AM.BaseGV = BaseGV;
6398 AM.BaseOffs = BaseOffset.getFixed();
6399 AM.HasBaseReg = HasBaseReg;
6400 AM.Scale = Scale;
6401 AM.ScalableOffset = BaseOffset.getScalable();
6402 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6403 // Scale represents reg2 * scale, thus account for 1 if
6404 // it is not equal to 0 or 1.
6405 return AM.Scale != 0 && AM.Scale != 1;
6406 return InstructionCost::getInvalid();
6407}
6408
6409 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6410 const Instruction *I) const {
6411 if (EnableOrLikeSelectOpt) {
6412 // For the binary operators (e.g. or) we need to be more careful than
6413 // selects, here we only transform them if they are already at a natural
6414 // break point in the code - the end of a block with an unconditional
6415 // terminator.
6416 if (I->getOpcode() == Instruction::Or &&
6417 isa<BranchInst>(I->getNextNode()) &&
6418 cast<BranchInst>(I->getNextNode())->isUnconditional())
6419 return true;
6420
6421 if (I->getOpcode() == Instruction::Add ||
6422 I->getOpcode() == Instruction::Sub)
6423 return true;
6424 }
6425 return BaseT::shouldTreatInstructionLikeSelect(I);
6426}
6427
6428 bool AArch64TTIImpl::isLSRCostLess(
6429 const TargetTransformInfo::LSRCost &C1,
6430 const TargetTransformInfo::LSRCost &C2) const {
6431 // AArch64 specific here is adding the number of instructions to the
6432 // comparison (though not as the first consideration, as some targets do)
6433 // along with changing the priority of the base additions.
6434 // TODO: Maybe a more nuanced tradeoff between instruction count
6435 // and number of registers? To be investigated at a later date.
6436 if (EnableLSRCostOpt)
6437 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6438 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6439 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6440 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6441
6442 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6443}
6444
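/// Check if \p V is a splat, i.e. a shufflevector whose mask elements are all
/// equal (for example mask <1,1,1,1>, which broadcasts lane 1).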
6445static bool isSplatShuffle(Value *V) {
6446 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6447 return all_equal(Shuf->getShuffleMask());
6448 return false;
6449}
6450
6451/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6452/// or upper half of the vector elements.
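/// For example, with <8 x i16> inputs the masks <0,1,2,3> and <4,5,6,7>
/// extract the low and high halves respectively, matching the operands of the
/// NEON widening instructions such as usubl and usubl2.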
6453static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6454 bool AllowSplat = false) {
6455 // Scalable types can't be extract shuffle vectors.
6456 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6457 return false;
6458
6459 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6460 auto *FullTy = FullV->getType();
6461 auto *HalfTy = HalfV->getType();
6462 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6463 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6464 };
6465
6466 auto extractHalf = [](Value *FullV, Value *HalfV) {
6467 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6468 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6469 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6470 };
6471
6472 ArrayRef<int> M1, M2;
6473 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6474 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6475 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6476 return false;
6477
6478 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6479 // it is not checked as an extract below.
6480 if (AllowSplat && isSplatShuffle(Op1))
6481 S1Op1 = nullptr;
6482 if (AllowSplat && isSplatShuffle(Op2))
6483 S2Op1 = nullptr;
6484
6485 // Check that the operands are half as wide as the result and we extract
6486 // half of the elements of the input vectors.
6487 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6488 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6489 return false;
6490
6491 // Check the mask extracts either the lower or upper half of vector
6492 // elements.
6493 int M1Start = 0;
6494 int M2Start = 0;
6495 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6496 if ((S1Op1 &&
6497 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6498 (S2Op1 &&
6499 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6500 return false;
6501
6502 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6503 (M2Start != 0 && M2Start != (NumElements / 2)))
6504 return false;
6505 if (S1Op1 && S2Op1 && M1Start != M2Start)
6506 return false;
6507
6508 return true;
6509}
6510
6511/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6512/// of the vector elements.
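/// For example, two "sext <4 x i16> %x to <4 x i32>" operands qualify, since
/// each doubles the element width from 16 to 32 bits.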
6513static bool areExtractExts(Value *Ext1, Value *Ext2) {
6514 auto areExtDoubled = [](Instruction *Ext) {
6515 return Ext->getType()->getScalarSizeInBits() ==
6516 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6517 };
6518
6519 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6520 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6521 !areExtDoubled(cast<Instruction>(Ext1)) ||
6522 !areExtDoubled(cast<Instruction>(Ext2)))
6523 return false;
6524
6525 return true;
6526}
6527
6528/// Check if Op could be used with vmull_high_p64 intrinsic.
6529 static bool isOperandOfVmullHighP64(Value *Op) {
6530 Value *VectorOperand = nullptr;
6531 ConstantInt *ElementIndex = nullptr;
6532 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6533 m_ConstantInt(ElementIndex))) &&
6534 ElementIndex->getValue() == 1 &&
6535 isa<FixedVectorType>(VectorOperand->getType()) &&
6536 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6537}
6538
6539/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6540static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6541 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6542}
6543
6544 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6545 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6546 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6547 if (!GEP || GEP->getNumOperands() != 2)
6548 return false;
6549
6550 Value *Base = GEP->getOperand(0);
6551 Value *Offsets = GEP->getOperand(1);
6552
6553 // We only care about scalar_base+vector_offsets.
6554 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6555 return false;
6556
6557 // Sink extends that would allow us to use 32-bit offset vectors.
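// For example, an offset vector produced by
//   %offs = zext <vscale x 4 x i32> %o to <vscale x 4 x i64>
// can use the 32-bit extended-offset forms of the SVE gather/scatter
// addressing modes once the zext is sunk next to the GEP.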
6558 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6559 auto *OffsetsInst = cast<Instruction>(Offsets);
6560 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6561 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6562 Ops.push_back(&GEP->getOperandUse(1));
6563 }
6564
6565 // Sink the GEP.
6566 return true;
6567}
6568
6569/// We want to sink following cases:
6570/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6571/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
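/// For example, sinking the "(shl vscale, 4)" in "gep A, (shl vscale, 4)" next
/// to its user allows isel to fold the VL-sized offset into the addressing
/// computation (e.g. an addvl) instead of materialising vscale separately.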
6572 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6573 if (match(Op, m_VScale()))
6574 return true;
6575 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6576 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6577 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6578 return true;
6579 }
6580 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6581 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6582 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6583 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6584 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6585 return true;
6586 }
6587 return false;
6588}
6589
6590static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
6591
6592/// Check if sinking \p I's operands to I's basic block is profitable, because
6593/// the operands can be folded into a target instruction, e.g.
6594/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6595 bool AArch64TTIImpl::isProfitableToSinkOperands(
6596 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6597 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
6598 switch (II->getIntrinsicID()) {
6599 case Intrinsic::aarch64_neon_smull:
6600 case Intrinsic::aarch64_neon_umull:
6601 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6602 /*AllowSplat=*/true)) {
6603 Ops.push_back(&II->getOperandUse(0));
6604 Ops.push_back(&II->getOperandUse(1));
6605 return true;
6606 }
6607 [[fallthrough]];
6608
6609 case Intrinsic::fma:
6610 case Intrinsic::fmuladd:
6611 if (isa<VectorType>(I->getType()) &&
6612 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6613 !ST->hasFullFP16())
6614 return false;
6615
6616 if (isFNeg(II->getOperand(0)))
6617 Ops.push_back(&II->getOperandUse(0));
6618 if (isFNeg(II->getOperand(1)))
6619 Ops.push_back(&II->getOperandUse(1));
6620
6621 [[fallthrough]];
6622 case Intrinsic::aarch64_neon_sqdmull:
6623 case Intrinsic::aarch64_neon_sqdmulh:
6624 case Intrinsic::aarch64_neon_sqrdmulh:
6625 // Sink splats for index lane variants
6626 if (isSplatShuffle(II->getOperand(0)))
6627 Ops.push_back(&II->getOperandUse(0));
6628 if (isSplatShuffle(II->getOperand(1)))
6629 Ops.push_back(&II->getOperandUse(1));
6630 return !Ops.empty();
6631 case Intrinsic::aarch64_neon_fmlal:
6632 case Intrinsic::aarch64_neon_fmlal2:
6633 case Intrinsic::aarch64_neon_fmlsl:
6634 case Intrinsic::aarch64_neon_fmlsl2:
6635 // Sink splats for index lane variants
6636 if (isSplatShuffle(II->getOperand(1)))
6637 Ops.push_back(&II->getOperandUse(1));
6638 if (isSplatShuffle(II->getOperand(2)))
6639 Ops.push_back(&II->getOperandUse(2));
6640 return !Ops.empty();
6641 case Intrinsic::aarch64_sve_ptest_first:
6642 case Intrinsic::aarch64_sve_ptest_last:
6643 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6644 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6645 Ops.push_back(&II->getOperandUse(0));
6646 return !Ops.empty();
6647 case Intrinsic::aarch64_sme_write_horiz:
6648 case Intrinsic::aarch64_sme_write_vert:
6649 case Intrinsic::aarch64_sme_writeq_horiz:
6650 case Intrinsic::aarch64_sme_writeq_vert: {
6651 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6652 if (!Idx || Idx->getOpcode() != Instruction::Add)
6653 return false;
6654 Ops.push_back(&II->getOperandUse(1));
6655 return true;
6656 }
6657 case Intrinsic::aarch64_sme_read_horiz:
6658 case Intrinsic::aarch64_sme_read_vert:
6659 case Intrinsic::aarch64_sme_readq_horiz:
6660 case Intrinsic::aarch64_sme_readq_vert:
6661 case Intrinsic::aarch64_sme_ld1b_vert:
6662 case Intrinsic::aarch64_sme_ld1h_vert:
6663 case Intrinsic::aarch64_sme_ld1w_vert:
6664 case Intrinsic::aarch64_sme_ld1d_vert:
6665 case Intrinsic::aarch64_sme_ld1q_vert:
6666 case Intrinsic::aarch64_sme_st1b_vert:
6667 case Intrinsic::aarch64_sme_st1h_vert:
6668 case Intrinsic::aarch64_sme_st1w_vert:
6669 case Intrinsic::aarch64_sme_st1d_vert:
6670 case Intrinsic::aarch64_sme_st1q_vert:
6671 case Intrinsic::aarch64_sme_ld1b_horiz:
6672 case Intrinsic::aarch64_sme_ld1h_horiz:
6673 case Intrinsic::aarch64_sme_ld1w_horiz:
6674 case Intrinsic::aarch64_sme_ld1d_horiz:
6675 case Intrinsic::aarch64_sme_ld1q_horiz:
6676 case Intrinsic::aarch64_sme_st1b_horiz:
6677 case Intrinsic::aarch64_sme_st1h_horiz:
6678 case Intrinsic::aarch64_sme_st1w_horiz:
6679 case Intrinsic::aarch64_sme_st1d_horiz:
6680 case Intrinsic::aarch64_sme_st1q_horiz: {
6681 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6682 if (!Idx || Idx->getOpcode() != Instruction::Add)
6683 return false;
6684 Ops.push_back(&II->getOperandUse(3));
6685 return true;
6686 }
6687 case Intrinsic::aarch64_neon_pmull:
6688 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6689 return false;
6690 Ops.push_back(&II->getOperandUse(0));
6691 Ops.push_back(&II->getOperandUse(1));
6692 return true;
6693 case Intrinsic::aarch64_neon_pmull64:
6694 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6695 II->getArgOperand(1)))
6696 return false;
6697 Ops.push_back(&II->getArgOperandUse(0));
6698 Ops.push_back(&II->getArgOperandUse(1));
6699 return true;
6700 case Intrinsic::masked_gather:
6701 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6702 return false;
6703 Ops.push_back(&II->getArgOperandUse(0));
6704 return true;
6705 case Intrinsic::masked_scatter:
6706 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6707 return false;
6708 Ops.push_back(&II->getArgOperandUse(1));
6709 return true;
6710 default:
6711 return false;
6712 }
6713 }
6714
6715 auto ShouldSinkCondition = [](Value *Cond,
6716 SmallVectorImpl<Use *> &Ops) -> bool {
6717 if (!isa<IntrinsicInst>(Cond))
6718 return false;
6719 auto *II = cast<IntrinsicInst>(Cond);
6720 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6721 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6722 return false;
6723 if (isa<CmpInst>(II->getOperand(0)))
6724 Ops.push_back(&II->getOperandUse(0));
6725 return true;
6726 };
6727
6728 switch (I->getOpcode()) {
6729 case Instruction::GetElementPtr:
6730 case Instruction::Add:
6731 case Instruction::Sub:
6732 // Sink vscales closer to uses for better isel
6733 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6734 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6735 Ops.push_back(&I->getOperandUse(Op));
6736 return true;
6737 }
6738 }
6739 break;
6740 case Instruction::Select: {
6741 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6742 return false;
6743
6744 Ops.push_back(&I->getOperandUse(0));
6745 return true;
6746 }
6747 case Instruction::Br: {
6748 if (cast<BranchInst>(I)->isUnconditional())
6749 return false;
6750
6751 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6752 return false;
6753
6754 Ops.push_back(&I->getOperandUse(0));
6755 return true;
6756 }
6757 case Instruction::FMul:
6758 // fmul with contract flag can be combined with fadd into fma.
6759 // Sinking fneg into this block enables fmls pattern.
6760 if (cast<FPMathOperator>(I)->hasAllowContract()) {
6761 if (isFNeg(I->getOperand(0)))
6762 Ops.push_back(&I->getOperandUse(0));
6763 if (isFNeg(I->getOperand(1)))
6764 Ops.push_back(&I->getOperandUse(1));
6765 }
6766 break;
6767
6768 default:
6769 break;
6770 }
6771
6772 if (!I->getType()->isVectorTy())
6773 return !Ops.empty();
6774
6775 switch (I->getOpcode()) {
6776 case Instruction::Sub:
6777 case Instruction::Add: {
6778 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6779 return false;
6780
6781 // If the exts' operands extract either the lower or upper elements, we
6782 // can sink them too.
6783 auto Ext1 = cast<Instruction>(I->getOperand(0));
6784 auto Ext2 = cast<Instruction>(I->getOperand(1));
6785 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6786 Ops.push_back(&Ext1->getOperandUse(0));
6787 Ops.push_back(&Ext2->getOperandUse(0));
6788 }
6789
6790 Ops.push_back(&I->getOperandUse(0));
6791 Ops.push_back(&I->getOperandUse(1));
6792
6793 return true;
6794 }
6795 case Instruction::Or: {
6796 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6797 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6798 if (ST->hasNEON()) {
6799 Instruction *OtherAnd, *IA, *IB;
6800 Value *MaskValue;
6801 // MainAnd refers to the And instruction that has 'Not' as one of its operands
6802 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6803 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6804 m_Instruction(IA)))))) {
6805 if (match(OtherAnd,
6806 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6807 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6808 ? cast<Instruction>(I->getOperand(1))
6809 : cast<Instruction>(I->getOperand(0));
6810
6811 // Both Ands should be in same basic block as Or
6812 if (I->getParent() != MainAnd->getParent() ||
6813 I->getParent() != OtherAnd->getParent())
6814 return false;
6815
6816 // Non-mask operands of both Ands should also be in same basic block
6817 if (I->getParent() != IA->getParent() ||
6818 I->getParent() != IB->getParent())
6819 return false;
6820
6821 Ops.push_back(
6822 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6823 Ops.push_back(&I->getOperandUse(0));
6824 Ops.push_back(&I->getOperandUse(1));
6825
6826 return true;
6827 }
6828 }
6829 }
6830
6831 return false;
6832 }
6833 case Instruction::Mul: {
6834 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6835 auto *Ty = cast<VectorType>(V->getType());
6836 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6837 if (Ty->isScalableTy())
6838 return false;
6839
6840 // Indexed variants of Mul exist for i16 and i32 element types only.
6841 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6842 };
6843
6844 int NumZExts = 0, NumSExts = 0;
6845 for (auto &Op : I->operands()) {
6846 // Make sure we are not already sinking this operand
6847 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6848 continue;
6849
6850 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6851 auto *Ext = cast<Instruction>(Op);
6852 auto *ExtOp = Ext->getOperand(0);
6853 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6854 Ops.push_back(&Ext->getOperandUse(0));
6855 Ops.push_back(&Op);
6856
6857 if (isa<SExtInst>(Ext)) {
6858 NumSExts++;
6859 } else {
6860 NumZExts++;
6861 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6862 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6863 I->getType()->getScalarSizeInBits())
6864 NumSExts++;
6865 }
6866
6867 continue;
6868 }
6869
6870 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6871 if (!Shuffle)
6872 continue;
6873
6874 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6875 // operand and the s/zext can help create indexed s/umull. This is
6876 // especially useful to prevent i64 mul being scalarized.
6877 if (isSplatShuffle(Shuffle) &&
6878 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6879 Ops.push_back(&Shuffle->getOperandUse(0));
6880 Ops.push_back(&Op);
6881 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6882 NumSExts++;
6883 else
6884 NumZExts++;
6885 continue;
6886 }
6887
6888 Value *ShuffleOperand = Shuffle->getOperand(0);
6889 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6890 if (!Insert)
6891 continue;
6892
6893 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6894 if (!OperandInstr)
6895 continue;
6896
6897 ConstantInt *ElementConstant =
6898 dyn_cast<ConstantInt>(Insert->getOperand(2));
6899 // Check that the insertelement is inserting into element 0
6900 if (!ElementConstant || !ElementConstant->isZero())
6901 continue;
6902
6903 unsigned Opcode = OperandInstr->getOpcode();
6904 if (Opcode == Instruction::SExt)
6905 NumSExts++;
6906 else if (Opcode == Instruction::ZExt)
6907 NumZExts++;
6908 else {
6909 // If we find that the top bits are known 0, then we can sink and allow
6910 // the backend to generate a umull.
6911 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6912 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6913 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6914 continue;
6915 NumZExts++;
6916 }
6917
6918 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6919 // the And, just to hoist it again back to the load.
6920 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6921 Ops.push_back(&Insert->getOperandUse(1));
6922 Ops.push_back(&Shuffle->getOperandUse(0));
6923 Ops.push_back(&Op);
6924 }
6925
6926 // It is profitable to sink if we found two of the same type of extends.
6927 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6928 return true;
6929
6930 // Otherwise, see if we should sink splats for indexed variants.
6931 if (!ShouldSinkSplatForIndexedVariant(I))
6932 return false;
6933
6934 Ops.clear();
6935 if (isSplatShuffle(I->getOperand(0)))
6936 Ops.push_back(&I->getOperandUse(0));
6937 if (isSplatShuffle(I->getOperand(1)))
6938 Ops.push_back(&I->getOperandUse(1));
6939
6940 return !Ops.empty();
6941 }
6942 case Instruction::FMul: {
6943 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6944 if (I->getType()->isScalableTy())
6945 return !Ops.empty();
6946
6947 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6948 !ST->hasFullFP16())
6949 return !Ops.empty();
6950
6951 // Sink splats for index lane variants
6952 if (isSplatShuffle(I->getOperand(0)))
6953 Ops.push_back(&I->getOperandUse(0));
6954 if (isSplatShuffle(I->getOperand(1)))
6955 Ops.push_back(&I->getOperandUse(1));
6956 return !Ops.empty();
6957 }
6958 default:
6959 return false;
6960 }
6961 return false;
6962}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1679
unsigned countLeadingOnes() const
Definition APInt.h:1633
void negate()
Negate this APInt in place.
Definition APInt.h:1477
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
unsigned logBase2() const
Definition APInt.h:1770
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
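As a hedged illustration of the CallBase accessors above (this sketch is not code from this file), the following counts the pointer-typed arguments of an assumed call site CB:
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
static unsigned countPointerArgs(const CallBase &CB) {
  unsigned NumPtrArgs = 0;
  for (unsigned I = 0, E = CB.arg_size(); I != E; ++I)
    if (CB.getArgOperand(I)->getType()->isPointerTy())
      ++NumPtrArgs;
  return NumPtrArgs;
}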
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
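A minimal sketch of getSplat (assuming this is the ConstantVector overload listed above), building a <4 x i32> splat of 1; the helper name splatOfOne is made up for illustration:
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
static Constant *splatOfOne(LLVMContext &Ctx) {
  Constant *One = ConstantInt::get(Type::getInt32Ty(Ctx), 1);
  // Yields the constant <i32 1, i32 1, i32 1, i32 1>.
  return ConstantVector::getSplat(ElementCount::getFixed(4), One);
}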
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:771
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
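The difference between the two ElementCount factories above shows up when building vector types; a minimal sketch, assuming an existing LLVMContext:
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
static VectorType *buildVectors(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  // ElementCount::getFixed(4)    -> <4 x i32>          (exactly 4 lanes)
  // ElementCount::getScalable(4) -> <vscale x 4 x i32> (4 * vscale lanes)
  (void)VectorType::get(I32, ElementCount::getFixed(4));
  return VectorType::get(I32, ElementCount::getScalable(4));
}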
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2553
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2541
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2257
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2465
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2175
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2575
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
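A minimal sketch of the masked-memory builders above: load a <4 x i32> under a mask and store it back. Ptr, Mask, and the insertion block BB are assumed to exist and to have suitable types.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
static void copyUnderMask(BasicBlock *BB, Value *Ptr, Value *Mask) {
  IRBuilder<> Builder(BB); // append new instructions to the end of BB
  auto *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
  // Lanes where Mask is false take the pass-through value (poison by default).
  Value *Vec = Builder.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask);
  Builder.CreateMaskedStore(Vec, Ptr, Align(16), Mask);
}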
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2248
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2762
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
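A hedged sketch of how the trip-count queries above are typically combined; SE and L are assumed to come from the usual analysis passes, and the bound of 16 is arbitrary:
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;
static bool hasSmallConstantMaxTripCount(ScalarEvolution &SE, const Loop *L) {
  // Returns 0 when no constant upper bound on the trip count is known.
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  return MaxTC != 0 && MaxTC <= 16;
}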
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:712
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of an instruction.
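A minimal sketch of querying the cost model with one of these kinds, assuming a TargetTransformInfo reference and an instruction obtained elsewhere:
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
static InstructionCost throughputCost(const TargetTransformInfo &TTI,
                                      const Instruction *I) {
  // Reciprocal throughput is the kind most vectorizer cost queries use.
  return TTI.getInstructionCost(I, TargetTransformInfo::TCK_RecipThroughput);
}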
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
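Illustrative only: asking the target for the cost of an SK_Reverse shuffle of <8 x i16>. The parameter order mirrors the getShuffleCost overload listed earlier on this page and may differ between LLVM releases.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
static InstructionCost reverseShuffleCost(const TargetTransformInfo &TTI,
                                          LLVMContext &Ctx) {
  auto *VTy = FixedVectorType::get(Type::getInt16Ty(Ctx), 8);
  return TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VTy, VTy,
                            /*Mask=*/{},
                            TargetTransformInfo::TCK_RecipThroughput,
                            /*Index=*/0, /*SubTp=*/nullptr);
}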
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:963
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:879
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:992
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:843
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:849
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:977
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:925
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:958
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:855
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
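The matchers above are used in the commutative style shown in this sketch, which recognizes an increment in either operand order (the helper name isIncrement is made up):
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
static bool isIncrement(Value *V, Value *&X) {
  // Matches both "add X, 1" and "add 1, X"; X captures the other operand.
  return match(V, m_c_Add(m_Value(X), m_One()));
}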
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
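A hedged sketch of the cost-table idiom this file relies on; the table contents and the fallback value below are illustrative, not the real AArch64 numbers.
#include "llvm/CodeGen/CostTable.h"   // also pulls in MVT
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;
static unsigned lookupMulCost(MVT VT) {
  static const CostTblEntry MulTbl[] = {
      {ISD::MUL, MVT::v4i32, 1}, // made-up cost
      {ISD::MUL, MVT::v2i64, 4}, // made-up cost
  };
  if (const auto *Entry = CostTableLookup(MulTbl, ISD::MUL, VT))
    return Entry->Cost;
  return 2; // assumed default when the type is not in the table
}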
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
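For example, the mask <0, 8, 1, 9, 2, 10, 3, 11> interleaves the low halves of two 8-element vectors, which is the zip1 pattern. A sketch of the check, assuming the AArch64 helper declared above is visible in the current translation unit:
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;
static bool looksLikeZip1() {
  unsigned WhichResult = 0, OperandOrder = 0;
  int Mask[] = {0, 8, 1, 9, 2, 10, 3, 11};
  // Expect success with WhichResult == 0 (zip1 rather than zip2).
  return isZIPMask(Mask, /*NumElts=*/8, WhichResult, OperandOrder) &&
         WhichResult == 0;
}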
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
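Small worked values for the bit-math helpers above (all from llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
using namespace llvm;
static_assert(isPowerOf2_32(64u) && !isPowerOf2_32(0u),
              "isPowerOf2_32 accepts only non-zero powers of two");
// Log2_32(64) == 6, and NextPowerOf2(17) == 32 (strictly greater than 17).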
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y are an integer type, one is the current recur...
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2156
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp; IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
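A minimal sketch (not the AArch64 policy in this file) of how a target's getUnrollingPreferences override typically adjusts these knobs; the numeric values are illustrative only.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.PartialThreshold = 150;        // illustrative cost threshold
  UP.DefaultUnrollRuntimeCount = 4; // unroll factor for runtime trip counts
  UP.UpperBound = true;             // may unroll using the trip-count upper bound
}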