1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
214 cl::location(TailFoldingOptionLoc));
215
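// Illustrative option strings (not from the upstream file): with
// -sve-tail-folding=all+noreverse every legal loop type is tail-folded except
// those needing reversed predicates, while -sve-tail-folding=default+reductions
// starts from the target CPU's default settings and additionally enables
// tail-folding of reduction loops. Unrecognised flags such as
// -sve-tail-folding=foo are rejected via TailFoldingOption::reportError.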
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
248 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
255uint64_t AArch64TTIImpl::getFMVPriority(const Function &F) const {
256 StringRef AttributeStr =
257 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
258 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
259 SmallVector<StringRef, 8> Features;
260 FeatureStr.split(Features, ",");
261 return AArch64::getFMVPriority(Features);
262}
263
264bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
265 return F.hasFnAttribute("fmv-features");
266}
267
268const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
269 AArch64::FeatureExecuteOnly,
270};
271
272bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
273 const Function *Callee) const {
274 SMECallAttrs CallAttrs(*Caller, *Callee);
275
276 // Never inline a function explicitly marked as being streaming,
277 // into a non-streaming function. Assume it was marked as streaming
278 // for a reason.
279 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
280 CallAttrs.callee().hasStreamingBody())
281 return false;
282
283 // When inlining, we should consider the body of the function, not the
284 // interface.
285 if (CallAttrs.callee().hasStreamingBody()) {
286 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
287 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
288 }
289
290 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
291 return false;
292
293 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
294 CallAttrs.requiresPreservingZT0() ||
295 CallAttrs.requiresPreservingAllZAState()) {
296 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
297 return false;
298 }
299
300 const TargetMachine &TM = getTLI()->getTargetMachine();
301 const FeatureBitset &CallerBits =
302 TM.getSubtargetImpl(*Caller)->getFeatureBits();
303 const FeatureBitset &CalleeBits =
304 TM.getSubtargetImpl(*Callee)->getFeatureBits();
305 // Adjust the feature bitsets by inverting some of the bits. This is needed
306 // for target features that represent restrictions rather than capabilities,
307 // for example a "+execute-only" callee can be inlined into a caller without
308 // "+execute-only", but not vice versa.
309 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
310 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
311
312 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
313}
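// Worked example for the feature-bitset check above (illustrative only): if
// the callee is built with +execute-only and the caller is not, XOR-ing both
// bitsets with InlineInverseFeatures clears the bit in the effective callee
// and sets it in the effective caller, so the subset test passes and inlining
// is allowed; in the opposite direction the subset test fails, matching the
// "but not vice versa" rule described in the comment above.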
314
315bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
316 const Function *Callee,
317 ArrayRef<Type *> Types) const {
318 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
319 return false;
320
321 // We need to ensure that argument promotion does not attempt to promote
322 // pointers to fixed-length vector types larger than 128 bits like
323 // <8 x float> (and pointers to aggregate types which have such fixed-length
324 // vector type members) into the values of the pointees. Such vector types
325 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
326 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
327 // types can be safely treated as 128-bit NEON types and they cannot be
328 // distinguished in IR.
329 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
330 auto FVTy = dyn_cast<FixedVectorType>(Ty);
331 return FVTy &&
332 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
333 }))
334 return false;
335
336 return true;
337}
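// Illustrative example for the check above: with SVE used for fixed-length
// vectors, a pointer argument whose pointee is <8 x float> (256 bits) blocks
// argument promotion here, whereas a pointee of <4 x float> (128 bits) is
// acceptable because it can be treated as a plain NEON value.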
338
339unsigned
340AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
341 unsigned DefaultCallPenalty) const {
342 // This function calculates a penalty for executing Call in F.
343 //
344 // There are two ways this function can be called:
345 // (1) F:
346 // call from F -> G (the call here is Call)
347 //
348 // For (1), Call.getCaller() == F, so it will always return a high cost if
349 // a streaming-mode change is required (thus promoting the need to inline the
350 // function)
351 //
352 // (2) F:
353 // call from F -> G (the call here is not Call)
354 // G:
355 // call from G -> H (the call here is Call)
356 //
357 // For (2), if after inlining the body of G into F the call to H requires a
358 // streaming-mode change, and the call to G from F would also require a
359 // streaming-mode change, then there is benefit to do the streaming-mode
360 // change only once and avoid inlining of G into F.
361
362 SMEAttrs FAttrs(*F);
363 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
364
365 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
366 if (F == Call.getCaller()) // (1)
367 return CallPenaltyChangeSM * DefaultCallPenalty;
368 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
369 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
370 }
371
372 return DefaultCallPenalty;
373}
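// Illustrative numbers for the logic above, using the default option values:
// with DefaultCallPenalty = P, a call in F that itself requires a
// streaming-mode change is costed at CallPenaltyChangeSM * P = 5 * P (case 1),
// while a call that would only require the change after inlining G into F is
// costed at InlineCallPenaltyChangeSM * P = 10 * P (case 2).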
374
375bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
376 TargetTransformInfo::RegisterKind K) const {
377 assert(K != TargetTransformInfo::RGK_Scalar);
378
379 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
380 return true;
381
382 return K == TargetTransformInfo::RGK_ScalableVector &&
383 ST->isSVEorStreamingSVEAvailable() &&
384 !ST->disableMaximizeScalableBandwidth();
385}
386
387/// Calculate the cost of materializing a 64-bit value. This helper
388/// method might only calculate a fraction of a larger immediate. Therefore it
389/// is valid to return a cost of ZERO.
390InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
391 // Check if the immediate can be encoded within an instruction.
392 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
393 return 0;
394
395 if (Val < 0)
396 Val = ~Val;
397
398 // Calculate how many moves we will need to materialize this constant.
399 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
400 AArch64_IMM::expandMOVImm(Val, 64, Insn);
401 return Insn.size();
402}
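// Illustrative costs for the helper above (assuming the usual MOVZ/MOVK
// expansion): 0xff is a valid logical immediate, so its cost is 0, whereas a
// constant such as 0x1234567812345678 typically expands to a MOVZ plus three
// MOVKs, giving a cost of 4. Negative values are bitwise inverted first so
// that MOVN-style materialisations are costed the same way.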
403
404/// Calculate the cost of materializing the given constant.
405InstructionCost
406AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
407 TTI::TargetCostKind CostKind) const {
408 assert(Ty->isIntegerTy());
409
410 unsigned BitSize = Ty->getPrimitiveSizeInBits();
411 if (BitSize == 0)
412 return ~0U;
413
414 // Sign-extend all constants to a multiple of 64-bit.
415 APInt ImmVal = Imm;
416 if (BitSize & 0x3f)
417 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
418
419 // Split the constant into 64-bit chunks and calculate the cost for each
420 // chunk.
421 InstructionCost Cost = 0;
422 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
423 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
424 int64_t Val = Tmp.getSExtValue();
425 Cost += getIntImmCost(Val);
426 }
427 // We need at least one instruction to materialize the constant.
428 return std::max<InstructionCost>(1, Cost);
429}
430
431InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
432 const APInt &Imm, Type *Ty,
433 TTI::TargetCostKind CostKind,
434 Instruction *Inst) const {
435 assert(Ty->isIntegerTy());
436
437 unsigned BitSize = Ty->getPrimitiveSizeInBits();
438 // There is no cost model for constants with a bit size of 0. Return TCC_Free
439 // here, so that constant hoisting will ignore this constant.
440 if (BitSize == 0)
441 return TTI::TCC_Free;
442
443 unsigned ImmIdx = ~0U;
444 switch (Opcode) {
445 default:
446 return TTI::TCC_Free;
447 case Instruction::GetElementPtr:
448 // Always hoist the base address of a GetElementPtr.
449 if (Idx == 0)
450 return 2 * TTI::TCC_Basic;
451 return TTI::TCC_Free;
452 case Instruction::Store:
453 ImmIdx = 0;
454 break;
455 case Instruction::Add:
456 case Instruction::Sub:
457 case Instruction::Mul:
458 case Instruction::UDiv:
459 case Instruction::SDiv:
460 case Instruction::URem:
461 case Instruction::SRem:
462 case Instruction::And:
463 case Instruction::Or:
464 case Instruction::Xor:
465 case Instruction::ICmp:
466 ImmIdx = 1;
467 break;
468 // Always return TCC_Free for the shift value of a shift instruction.
469 case Instruction::Shl:
470 case Instruction::LShr:
471 case Instruction::AShr:
472 if (Idx == 1)
473 return TTI::TCC_Free;
474 break;
475 case Instruction::Trunc:
476 case Instruction::ZExt:
477 case Instruction::SExt:
478 case Instruction::IntToPtr:
479 case Instruction::PtrToInt:
480 case Instruction::BitCast:
481 case Instruction::PHI:
482 case Instruction::Call:
483 case Instruction::Select:
484 case Instruction::Ret:
485 case Instruction::Load:
486 break;
487 }
488
489 if (Idx == ImmIdx) {
490 int NumConstants = (BitSize + 63) / 64;
491 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
492 return (Cost <= NumConstants * TTI::TCC_Basic)
493 ? static_cast<int>(TTI::TCC_Free)
494 : Cost;
495 }
496 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
497}
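// Illustrative consequence of the logic above: for an Add the immediate
// operand (ImmIdx == 1) is reported as TCC_Free whenever it can be
// materialised in a single instruction, so constant hoisting leaves it in
// place; only immediates needing several MOV/MOVK instructions report their
// real cost and become hoisting candidates. Shift amounts are always free.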
498
499InstructionCost
500AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
501 const APInt &Imm, Type *Ty,
502 TTI::TargetCostKind CostKind) const {
503 assert(Ty->isIntegerTy());
504
505 unsigned BitSize = Ty->getPrimitiveSizeInBits();
506 // There is no cost model for constants with a bit size of 0. Return TCC_Free
507 // here, so that constant hoisting will ignore this constant.
508 if (BitSize == 0)
509 return TTI::TCC_Free;
510
511 // Most (all?) AArch64 intrinsics do not support folding immediates into the
512 // selected instruction, so we compute the materialization cost for the
513 // immediate directly.
514 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
515 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
516
517 switch (IID) {
518 default:
519 return TTI::TCC_Free;
520 case Intrinsic::sadd_with_overflow:
521 case Intrinsic::uadd_with_overflow:
522 case Intrinsic::ssub_with_overflow:
523 case Intrinsic::usub_with_overflow:
524 case Intrinsic::smul_with_overflow:
525 case Intrinsic::umul_with_overflow:
526 if (Idx == 1) {
527 int NumConstants = (BitSize + 63) / 64;
528 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
529 return (Cost <= NumConstants * TTI::TCC_Basic)
530 ? static_cast<int>(TTI::TCC_Free)
531 : Cost;
532 }
533 break;
534 case Intrinsic::experimental_stackmap:
535 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 case Intrinsic::experimental_patchpoint_void:
539 case Intrinsic::experimental_patchpoint:
540 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
541 return TTI::TCC_Free;
542 break;
543 case Intrinsic::experimental_gc_statepoint:
544 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
545 return TTI::TCC_Free;
546 break;
547 }
548 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
549}
550
551TargetTransformInfo::PopcntSupportKind
552AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
553 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
554 if (TyWidth == 32 || TyWidth == 64)
555 return TTI::PSK_FastHardware;
556 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
557 return TTI::PSK_Software;
558}
559
560static bool isUnpackedVectorVT(EVT VecVT) {
561 return VecVT.isScalableVector() &&
562 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
563}
564
565static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
566 const IntrinsicCostAttributes &ICA) {
567 // We need to know at least the number of elements in the vector of buckets
568 // and the size of each element to update.
569 if (ICA.getArgTypes().size() < 2)
570 return InstructionCost::getInvalid();
571
572 // Only interested in costing for the hardware instruction from SVE2.
573 if (!ST->hasSVE2())
574 return InstructionCost::getInvalid();
575
576 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
577 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
578 unsigned TotalHistCnts = 1;
579
580 unsigned EltSize = EltTy->getScalarSizeInBits();
581 // Only allow (up to 64b) integers or pointers
582 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
583 return InstructionCost::getInvalid();
584
585 // FIXME: We should be able to generate histcnt for fixed-length vectors
586 // using ptrue with a specific VL.
587 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
588 unsigned EC = VTy->getElementCount().getKnownMinValue();
589 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
590 return InstructionCost::getInvalid();
591
592 // HistCnt only supports 32b and 64b element types
593 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
594
595 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
596 return InstructionCost(BaseHistCntCost);
597
598 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
599 TotalHistCnts = EC / NaturalVectorWidth;
600
601 return InstructionCost(BaseHistCntCost * TotalHistCnts);
602 }
603
604 return InstructionCost::getInvalid();
605}
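// Illustrative costs for the helper above with the default
// aarch64-base-histcnt-cost of 8: a <vscale x 4 x i32> or <vscale x 2 x i64>
// vector of bucket pointers maps onto a single SVE2 HISTCNT and costs 8,
// while <vscale x 8 x i32> needs two HISTCNTs and costs 16. Non-SVE2
// subtargets and fixed-length vectors report an invalid cost instead.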
606
607InstructionCost
608AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
609 TTI::TargetCostKind CostKind) const {
610 // The code-generator is currently not able to handle scalable vectors
611 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
612 // it. This change will be removed when code-generation for these types is
613 // sufficiently reliable.
614 auto *RetTy = ICA.getReturnType();
615 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
616 if (VTy->getElementCount() == ElementCount::getScalable(1))
617 return InstructionCost::getInvalid();
618
619 switch (ICA.getID()) {
620 case Intrinsic::experimental_vector_histogram_add: {
621 InstructionCost HistCost = getHistogramCost(ST, ICA);
622 // If the cost isn't valid, we may still be able to scalarize
623 if (HistCost.isValid())
624 return HistCost;
625 break;
626 }
627 case Intrinsic::umin:
628 case Intrinsic::umax:
629 case Intrinsic::smin:
630 case Intrinsic::smax: {
631 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
632 MVT::v8i16, MVT::v2i32, MVT::v4i32,
633 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
634 MVT::nxv2i64};
635 auto LT = getTypeLegalizationCost(RetTy);
636 // v2i64 types get converted to cmp+bif hence the cost of 2
637 if (LT.second == MVT::v2i64)
638 return LT.first * 2;
639 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
640 return LT.first;
641 break;
642 }
643 case Intrinsic::sadd_sat:
644 case Intrinsic::ssub_sat:
645 case Intrinsic::uadd_sat:
646 case Intrinsic::usub_sat: {
647 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
648 MVT::v8i16, MVT::v2i32, MVT::v4i32,
649 MVT::v2i64};
650 auto LT = getTypeLegalizationCost(RetTy);
651 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
652 // need to extend the type, as it uses shr(qadd(shl, shl)).
653 unsigned Instrs =
654 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
655 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
656 return LT.first * Instrs;
657
659 uint64_t VectorSize = TS.getKnownMinValue();
660
661 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
662 return LT.first * Instrs;
663
664 break;
665 }
666 case Intrinsic::abs: {
667 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
668 MVT::v8i16, MVT::v2i32, MVT::v4i32,
669 MVT::v2i64};
670 auto LT = getTypeLegalizationCost(RetTy);
671 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
672 return LT.first;
673 break;
674 }
675 case Intrinsic::bswap: {
676 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
677 MVT::v4i32, MVT::v2i64};
678 auto LT = getTypeLegalizationCost(RetTy);
679 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
680 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
681 return LT.first;
682 break;
683 }
684 case Intrinsic::fma:
685 case Intrinsic::fmuladd: {
686 // Given a fma or fmuladd, cost it the same as a fmul instruction, which is
687 // usually the same cost. TODO: Add fp16 and bf16 expansion costs.
688 Type *EltTy = RetTy->getScalarType();
689 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
690 (EltTy->isHalfTy() && ST->hasFullFP16()))
691 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
692 break;
693 }
694 case Intrinsic::stepvector: {
695 InstructionCost Cost = 1; // Cost of the `index' instruction
696 auto LT = getTypeLegalizationCost(RetTy);
697 // Legalisation of illegal vectors involves an `index' instruction plus
698 // (LT.first - 1) vector adds.
699 if (LT.first > 1) {
700 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
701 InstructionCost AddCost =
702 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
703 Cost += AddCost * (LT.first - 1);
704 }
705 return Cost;
706 }
707 case Intrinsic::vector_extract:
708 case Intrinsic::vector_insert: {
709 // If both the vector and subvector types are legal types and the index
710 // is 0, then this should be a no-op or simple operation; return a
711 // relatively low cost.
712
713 // If arguments aren't actually supplied, then we cannot determine the
714 // value of the index. We also want to skip predicate types.
715 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
716 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
717 break;
718
719 LLVMContext &C = RetTy->getContext();
720 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
721 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
722 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
723 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
724 // Skip this if either the vector or subvector types are unpacked
725 // SVE types; they may get lowered to stack stores and loads.
726 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
727 break;
728
729 TargetLoweringBase::LegalizeKind SubVecLK =
730 getTLI()->getTypeConversion(C, SubVecVT);
731 TargetLoweringBase::LegalizeKind VecLK =
732 getTLI()->getTypeConversion(C, VecVT);
733 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
734 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
735 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
736 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
737 return TTI::TCC_Free;
738 break;
739 }
740 case Intrinsic::bitreverse: {
741 static const CostTblEntry BitreverseTbl[] = {
742 {Intrinsic::bitreverse, MVT::i32, 1},
743 {Intrinsic::bitreverse, MVT::i64, 1},
744 {Intrinsic::bitreverse, MVT::v8i8, 1},
745 {Intrinsic::bitreverse, MVT::v16i8, 1},
746 {Intrinsic::bitreverse, MVT::v4i16, 2},
747 {Intrinsic::bitreverse, MVT::v8i16, 2},
748 {Intrinsic::bitreverse, MVT::v2i32, 2},
749 {Intrinsic::bitreverse, MVT::v4i32, 2},
750 {Intrinsic::bitreverse, MVT::v1i64, 2},
751 {Intrinsic::bitreverse, MVT::v2i64, 2},
752 };
753 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
754 const auto *Entry =
755 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
756 if (Entry) {
757 // The cost model uses the legal type (i32) that i8 and i16 will be
758 // converted to, plus 1 so that we match the actual lowering cost.
759 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
760 TLI->getValueType(DL, RetTy, true) == MVT::i16)
761 return LegalisationCost.first * Entry->Cost + 1;
762
763 return LegalisationCost.first * Entry->Cost;
764 }
765 break;
766 }
767 case Intrinsic::ctpop: {
768 if (!ST->hasNEON()) {
769 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
770 return getTypeLegalizationCost(RetTy).first * 12;
771 }
772 static const CostTblEntry CtpopCostTbl[] = {
773 {ISD::CTPOP, MVT::v2i64, 4},
774 {ISD::CTPOP, MVT::v4i32, 3},
775 {ISD::CTPOP, MVT::v8i16, 2},
776 {ISD::CTPOP, MVT::v16i8, 1},
777 {ISD::CTPOP, MVT::i64, 4},
778 {ISD::CTPOP, MVT::v2i32, 3},
779 {ISD::CTPOP, MVT::v4i16, 2},
780 {ISD::CTPOP, MVT::v8i8, 1},
781 {ISD::CTPOP, MVT::i32, 5},
782 };
783 auto LT = getTypeLegalizationCost(RetTy);
784 MVT MTy = LT.second;
785 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
786 // Extra cost of +1 when illegal vector types are legalized by promoting
787 // the integer type.
788 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
789 RetTy->getScalarSizeInBits()
790 ? 1
791 : 0;
792 return LT.first * Entry->Cost + ExtraCost;
793 }
794 break;
795 }
796 case Intrinsic::sadd_with_overflow:
797 case Intrinsic::uadd_with_overflow:
798 case Intrinsic::ssub_with_overflow:
799 case Intrinsic::usub_with_overflow:
800 case Intrinsic::smul_with_overflow:
801 case Intrinsic::umul_with_overflow: {
802 static const CostTblEntry WithOverflowCostTbl[] = {
803 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
804 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
805 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
806 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
807 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
808 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
809 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
810 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
811 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
812 {Intrinsic::usub_with_overflow, MVT::i8, 3},
813 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
814 {Intrinsic::usub_with_overflow, MVT::i16, 3},
815 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
816 {Intrinsic::usub_with_overflow, MVT::i32, 1},
817 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
818 {Intrinsic::usub_with_overflow, MVT::i64, 1},
819 {Intrinsic::smul_with_overflow, MVT::i8, 5},
820 {Intrinsic::umul_with_overflow, MVT::i8, 4},
821 {Intrinsic::smul_with_overflow, MVT::i16, 5},
822 {Intrinsic::umul_with_overflow, MVT::i16, 4},
823 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
824 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
825 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
826 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
827 };
828 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
829 if (MTy.isSimple())
830 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
831 MTy.getSimpleVT()))
832 return Entry->Cost;
833 break;
834 }
835 case Intrinsic::fptosi_sat:
836 case Intrinsic::fptoui_sat: {
837 if (ICA.getArgTypes().empty())
838 break;
839 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
840 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
841 EVT MTy = TLI->getValueType(DL, RetTy);
842 // Check for the legal types, which are where the size of the input and the
843 // output are the same, or we are using cvt f64->i32 or f32->i64.
844 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
845 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
846 LT.second == MVT::v2f64)) {
847 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
848 (LT.second == MVT::f64 && MTy == MVT::i32) ||
849 (LT.second == MVT::f32 && MTy == MVT::i64)))
850 return LT.first;
851 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
852 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
853 MTy.getScalarSizeInBits() == 64)
854 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
855 }
856 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
857 // f32.
858 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
859 return LT.first + getIntrinsicInstrCost(
860 {ICA.getID(),
861 RetTy,
862 {ICA.getArgTypes()[0]->getWithNewType(
863 Type::getFloatTy(RetTy->getContext()))}},
864 CostKind);
865 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
866 (LT.second == MVT::f16 && MTy == MVT::i64) ||
867 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
868 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
869 return LT.first;
870 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
871 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
872 MTy.getScalarSizeInBits() == 32)
873 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
874 // Extending vector types v8f16->v8i32. These currently scalarize but the
875 // codegen could be better.
876 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
877 MTy.getScalarSizeInBits() == 64)
878 return MTy.getVectorNumElements() * 3;
879
880 // If we can, use a legal convert followed by a min+max
881 if ((LT.second.getScalarType() == MVT::f32 ||
882 LT.second.getScalarType() == MVT::f64 ||
883 LT.second.getScalarType() == MVT::f16) &&
884 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
885 Type *LegalTy =
886 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
887 if (LT.second.isVector())
888 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
889 InstructionCost Cost = 1;
890 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
891 LegalTy, {LegalTy, LegalTy});
892 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
893 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
894 LegalTy, {LegalTy, LegalTy});
895 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
896 return LT.first * Cost +
897 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
898 : 1);
899 }
900 // Otherwise we need to follow the default expansion that clamps the value
901 // using a float min/max with a fcmp+sel for nan handling when signed.
902 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
903 RetTy = RetTy->getScalarType();
904 if (LT.second.isVector()) {
905 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
906 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
907 }
908 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
909 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
910 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
911 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
912 Cost +=
913 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
914 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
915 if (IsSigned) {
916 Type *CondTy = RetTy->getWithNewBitWidth(1);
917 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
918 CmpInst::FCMP_UNO, CostKind);
919 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
920 CmpInst::FCMP_UNO, CostKind);
921 }
922 return LT.first * Cost;
923 }
924 case Intrinsic::fshl:
925 case Intrinsic::fshr: {
926 if (ICA.getArgs().empty())
927 break;
928
929 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
930
931 // ROTR / ROTL is a funnel shift with equal first and second operand. For
932 // ROTR on integer registers (i32/i64) this can be done in a single ror
933 // instruction. A fshl with a non-constant shift uses a neg + ror.
934 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
935 (RetTy->getPrimitiveSizeInBits() == 32 ||
936 RetTy->getPrimitiveSizeInBits() == 64)) {
937 InstructionCost NegCost =
938 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
939 return 1 + NegCost;
940 }
941
942 // TODO: Add handling for fshl where third argument is not a constant.
943 if (!OpInfoZ.isConstant())
944 break;
945
946 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
947 if (OpInfoZ.isUniform()) {
948 static const CostTblEntry FshlTbl[] = {
949 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
950 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
951 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
952 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
953 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
954 // to avoid having to duplicate the costs.
955 const auto *Entry =
956 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
957 if (Entry)
958 return LegalisationCost.first * Entry->Cost;
959 }
960
961 auto TyL = getTypeLegalizationCost(RetTy);
962 if (!RetTy->isIntegerTy())
963 break;
964
965 // Estimate cost manually, as types like i8 and i16 will get promoted to
966 // i32 and CostTableLookup will ignore the extra conversion cost.
967 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
968 RetTy->getScalarSizeInBits() < 64) ||
969 (RetTy->getScalarSizeInBits() % 64 != 0);
970 unsigned ExtraCost = HigherCost ? 1 : 0;
971 if (RetTy->getScalarSizeInBits() == 32 ||
972 RetTy->getScalarSizeInBits() == 64)
973 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
974 // extr instruction.
975 else if (HigherCost)
976 ExtraCost = 1;
977 else
978 break;
979 return TyL.first + ExtraCost;
980 }
981 case Intrinsic::get_active_lane_mask: {
982 auto RetTy = cast<VectorType>(ICA.getReturnType());
983 EVT RetVT = getTLI()->getValueType(DL, RetTy);
984 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
985 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
986 break;
987
988 if (RetTy->isScalableTy()) {
989 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
990 TargetLowering::TypeSplitVector)
991 break;
992
993 auto LT = getTypeLegalizationCost(RetTy);
994 InstructionCost Cost = LT.first;
995 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
996 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
997 // nxv32i1 = get_active_lane_mask(base, idx) ->
998 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
999 if (ST->hasSVE2p1() || ST->hasSME2()) {
1000 Cost /= 2;
1001 if (Cost == 1)
1002 return Cost;
1003 }
1004
1005 // If more than one whilelo intrinsic is required, include the extra cost
1006 // required by the saturating add & select required to increment the
1007 // start value after the first intrinsic call.
1008 Type *OpTy = ICA.getArgTypes()[0];
1009 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1010 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1011 Type *CondTy = OpTy->getWithNewBitWidth(1);
1012 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1013 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1014 return Cost + (SplitCost * (Cost - 1));
1015 } else if (!getTLI()->isTypeLegal(RetVT)) {
1016 // We don't have enough context at this point to determine if the mask
1017 // is going to be kept live after the block, which will force the vXi1
1018 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1019 // For now, we just assume the vectorizer created this intrinsic and
1020 // the result will be the input for a PHI. In this case the cost will
1021 // be extremely high for fixed-width vectors.
1022 // NOTE: getScalarizationOverhead returns a cost that's far too
1023 // pessimistic for the actual generated codegen. In reality there are
1024 // two instructions generated per lane.
1025 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1026 }
1027 break;
1028 }
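// Illustrative example for the scalable path above: an nxv32i1
// get_active_lane_mask legalizes into two nxv16i1 halves (LT.first == 2).
// With SVE2p1 or SME2 both halves can come from one whilelo_x2, so the cost
// is halved to 1; otherwise the second whilelo also pays for the uadd.sat and
// select that advance the start index.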
1029 case Intrinsic::experimental_vector_match: {
1030 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1031 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1032 unsigned SearchSize = NeedleTy->getNumElements();
1033 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1034 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1035 // Neoverse V3, these are cheap operations with the same latency as a
1036 // vector ADD. In most cases, however, we also need to do an extra DUP.
1037 // For fixed-length vectors we currently need an extra five to six
1038 // instructions besides the MATCH.
1040 if (isa<FixedVectorType>(RetTy))
1041 Cost += 10;
1042 return Cost;
1043 }
1044 break;
1045 }
1046 case Intrinsic::experimental_cttz_elts: {
1047 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1048 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1049 // This will consist of a SVE brkb and a cntp instruction. These
1050 // typically have the same latency and half the throughput as a vector
1051 // add instruction.
1052 return 4;
1053 }
1054 break;
1055 }
1056 case Intrinsic::experimental_vector_extract_last_active:
1057 if (ST->isSVEorStreamingSVEAvailable()) {
1058 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1059 // This should turn into chained clastb instructions.
1060 return LegalCost;
1061 }
1062 break;
1063 default:
1064 break;
1065 }
1067}
1068
1069/// The function will remove redundant reinterpret casts in the presence of
1070/// control flow.
1071static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1072 IntrinsicInst &II) {
1073 SmallVector<Instruction *, 32> Worklist;
1074 auto RequiredType = II.getType();
1075
1076 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1077 assert(PN && "Expected Phi Node!");
1078
1079 // Don't create a new Phi unless we can remove the old one.
1080 if (!PN->hasOneUse())
1081 return std::nullopt;
1082
1083 for (Value *IncValPhi : PN->incoming_values()) {
1084 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1085 if (!Reinterpret ||
1086 Reinterpret->getIntrinsicID() !=
1087 Intrinsic::aarch64_sve_convert_to_svbool ||
1088 RequiredType != Reinterpret->getArgOperand(0)->getType())
1089 return std::nullopt;
1090 }
1091
1092 // Create the new Phi
1093 IC.Builder.SetInsertPoint(PN);
1094 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1095 Worklist.push_back(PN);
1096
1097 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1098 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1099 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1100 Worklist.push_back(Reinterpret);
1101 }
1102
1103 // Cleanup Phi Node and reinterprets
1104 return IC.replaceInstUsesWith(II, NPN);
1105}
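// Illustrative transform performed by processPhiNode (sketch, not verbatim
// upstream output):
//   %phi = phi <vscale x 16 x i1> [ %a.sv, %bb0 ], [ %b.sv, %bb1 ]
//     where %a.sv and %b.sv are aarch64.sve.convert.to.svbool casts of %a, %b
//   %r = <reinterpret of %phi back to the type of %a>
// becomes a phi over %a and %b directly; the old phi and the casts are pushed
// onto Worklist so they can be cleaned up once they are dead.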
1106
1107// A collection of properties common to SVE intrinsics that allow for combines
1108// to be written without needing to know the specific intrinsic.
1109struct SVEIntrinsicInfo {
1110 //
1111 // Helper routines for common intrinsic definitions.
1112 //
1113
1114 // e.g. llvm.aarch64.sve.add pg, op1, op2
1115 // with IID ==> llvm.aarch64.sve.add_u
1116 static SVEIntrinsicInfo
1117 defaultMergingOp(Intrinsic::ID IID) {
1118 return SVEIntrinsicInfo()
1119 .setGoverningPredicateOperandIdx(0)
1120 .setOperandIdxInactiveLanesTakenFrom(1)
1121 .setMatchingUndefIntrinsic(IID);
1122 }
1123
1124 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1131
1132 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1138
1139 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1145
1146 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1147 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1148 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1149 return SVEIntrinsicInfo()
1152 }
1153
1154 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1155 // llvm.aarch64.sve.ld1 pg, ptr
1162
1163 // All properties relate to predication and thus having a general predicate
1164 // is the minimum requirement to say there is intrinsic info to act on.
1165 explicit operator bool() const { return hasGoverningPredicate(); }
1166
1167 //
1168 // Properties relating to the governing predicate.
1169 //
1170
1171 bool hasGoverningPredicate() const {
1172 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1173 }
1174
1175 unsigned getGoverningPredicateOperandIdx() const {
1176 assert(hasGoverningPredicate() && "Property not set!");
1177 return GoverningPredicateIdx;
1178 }
1179
1180 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1181 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1182 GoverningPredicateIdx = Index;
1183 return *this;
1184 }
1185
1186 //
1187 // Properties relating to operations the intrinsic could be transformed into.
1188 // NOTE: This does not mean such a transformation is always possible, but the
1189 // knowledge makes it possible to reuse existing optimisations without needing
1190 // to embed specific handling for each intrinsic. For example, instruction
1191 // simplification can be used to optimise an intrinsic's active lanes.
1192 //
1193
1194 bool hasMatchingUndefIntrinsic() const {
1195 return UndefIntrinsic != Intrinsic::not_intrinsic;
1196 }
1197
1198 Intrinsic::ID getMatchingUndefIntrinsic() const {
1199 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1200 return UndefIntrinsic;
1201 }
1202
1203 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1204 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1205 UndefIntrinsic = IID;
1206 return *this;
1207 }
1208
1209 bool hasMatchingIROpode() const { return IROpcode != 0; }
1210
1211 unsigned getMatchingIROpode() const {
1212 assert(hasMatchingIROpode() && "Property not set!");
1213 return IROpcode;
1214 }
1215
1216 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1217 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1218 IROpcode = Opcode;
1219 return *this;
1220 }
1221
1222 //
1223 // Properties relating to the result of inactive lanes.
1224 //
1225
1226 bool inactiveLanesTakenFromOperand() const {
1227 return ResultLanes == InactiveLanesTakenFromOperand;
1228 }
1229
1230 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1231 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1232 return OperandIdxForInactiveLanes;
1233 }
1234
1235 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1236 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1237 ResultLanes = InactiveLanesTakenFromOperand;
1238 OperandIdxForInactiveLanes = Index;
1239 return *this;
1240 }
1241
1242 bool inactiveLanesAreNotDefined() const {
1243 return ResultLanes == InactiveLanesAreNotDefined;
1244 }
1245
1246 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1247 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1248 ResultLanes = InactiveLanesAreNotDefined;
1249 return *this;
1250 }
1251
1252 bool inactiveLanesAreUnused() const {
1253 return ResultLanes == InactiveLanesAreUnused;
1254 }
1255
1256 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1257 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1258 ResultLanes = InactiveLanesAreUnused;
1259 return *this;
1260 }
1261
1262 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1263 // inactiveLanesAreZeroed =
1264 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1265 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1266
1267 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1268 ResultIsZeroInitialized = true;
1269 return *this;
1270 }
1271
1272 //
1273 // The first operand of unary merging operations is typically only used to
1274 // set the result for inactive lanes. Knowing this allows us to deadcode the
1275 // operand when we can prove there are no inactive lanes.
1276 //
1277
1278 bool hasOperandWithNoActiveLanes() const {
1279 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1280 }
1281
1282 unsigned getOperandIdxWithNoActiveLanes() const {
1283 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1284 return OperandIdxWithNoActiveLanes;
1285 }
1286
1287 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1288 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1289 OperandIdxWithNoActiveLanes = Index;
1290 return *this;
1291 }
1292
1293private:
1294 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1295
1296 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1297 unsigned IROpcode = 0;
1298
1299 enum PredicationStyle {
1300 Uninitialized,
1301 InactiveLanesTakenFromOperand,
1302 InactiveLanesAreNotDefined,
1303 InactiveLanesAreUnused
1304 } ResultLanes = Uninitialized;
1305
1306 bool ResultIsZeroInitialized = false;
1307 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1308 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1309};
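// Illustrative reading of the table below (not an exhaustive description):
// for llvm.aarch64.sve.fadd the info records operand 0 as the governing
// predicate, takes inactive result lanes from operand 1, names
// llvm.aarch64.sve.fadd_u as the variant whose inactive lanes are undefined,
// and records FAdd as the matching IR opcode, letting the generic combines
// below reuse existing instruction simplification for the active lanes.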
1310
1311static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1312 // Some SVE intrinsics do not use scalable vector types, but since they are
1313 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1314 if (!isa<ScalableVectorType>(II.getType()) &&
1315 all_of(II.args(), [&](const Value *V) {
1316 return !isa<ScalableVectorType>(V->getType());
1317 }))
1318 return SVEIntrinsicInfo();
1319
1320 Intrinsic::ID IID = II.getIntrinsicID();
1321 switch (IID) {
1322 default:
1323 break;
1324 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1325 case Intrinsic::aarch64_sve_fcvt_f16f32:
1326 case Intrinsic::aarch64_sve_fcvt_f16f64:
1327 case Intrinsic::aarch64_sve_fcvt_f32f16:
1328 case Intrinsic::aarch64_sve_fcvt_f32f64:
1329 case Intrinsic::aarch64_sve_fcvt_f64f16:
1330 case Intrinsic::aarch64_sve_fcvt_f64f32:
1331 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1332 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1333 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1334 case Intrinsic::aarch64_sve_fcvtzs:
1335 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1336 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1337 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1338 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1339 case Intrinsic::aarch64_sve_fcvtzu:
1340 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1341 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1342 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1343 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1344 case Intrinsic::aarch64_sve_scvtf:
1345 case Intrinsic::aarch64_sve_scvtf_f16i32:
1346 case Intrinsic::aarch64_sve_scvtf_f16i64:
1347 case Intrinsic::aarch64_sve_scvtf_f32i64:
1348 case Intrinsic::aarch64_sve_scvtf_f64i32:
1349 case Intrinsic::aarch64_sve_ucvtf:
1350 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1351 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1352 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1353 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1355
1356 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1357 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1358 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1359 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1361
1362 case Intrinsic::aarch64_sve_fabd:
1363 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1364 case Intrinsic::aarch64_sve_fadd:
1365 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1366 .setMatchingIROpcode(Instruction::FAdd);
1367 case Intrinsic::aarch64_sve_fdiv:
1368 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1369 .setMatchingIROpcode(Instruction::FDiv);
1370 case Intrinsic::aarch64_sve_fmax:
1371 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1372 case Intrinsic::aarch64_sve_fmaxnm:
1373 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1374 case Intrinsic::aarch64_sve_fmin:
1375 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1376 case Intrinsic::aarch64_sve_fminnm:
1377 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1378 case Intrinsic::aarch64_sve_fmla:
1379 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1380 case Intrinsic::aarch64_sve_fmls:
1381 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1382 case Intrinsic::aarch64_sve_fmul:
1383 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1384 .setMatchingIROpcode(Instruction::FMul);
1385 case Intrinsic::aarch64_sve_fmulx:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1387 case Intrinsic::aarch64_sve_fnmla:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1389 case Intrinsic::aarch64_sve_fnmls:
1390 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1391 case Intrinsic::aarch64_sve_fsub:
1392 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1393 .setMatchingIROpcode(Instruction::FSub);
1394 case Intrinsic::aarch64_sve_add:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1396 .setMatchingIROpcode(Instruction::Add);
1397 case Intrinsic::aarch64_sve_mla:
1398 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1399 case Intrinsic::aarch64_sve_mls:
1400 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1401 case Intrinsic::aarch64_sve_mul:
1402 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1403 .setMatchingIROpcode(Instruction::Mul);
1404 case Intrinsic::aarch64_sve_sabd:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1406 case Intrinsic::aarch64_sve_sdiv:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1408 .setMatchingIROpcode(Instruction::SDiv);
1409 case Intrinsic::aarch64_sve_smax:
1410 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1411 case Intrinsic::aarch64_sve_smin:
1412 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1413 case Intrinsic::aarch64_sve_smulh:
1414 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1415 case Intrinsic::aarch64_sve_sub:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1417 .setMatchingIROpcode(Instruction::Sub);
1418 case Intrinsic::aarch64_sve_uabd:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1420 case Intrinsic::aarch64_sve_udiv:
1421 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1422 .setMatchingIROpcode(Instruction::UDiv);
1423 case Intrinsic::aarch64_sve_umax:
1424 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1425 case Intrinsic::aarch64_sve_umin:
1426 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1427 case Intrinsic::aarch64_sve_umulh:
1428 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1429 case Intrinsic::aarch64_sve_asr:
1430 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1431 .setMatchingIROpcode(Instruction::AShr);
1432 case Intrinsic::aarch64_sve_lsl:
1433 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1434 .setMatchingIROpcode(Instruction::Shl);
1435 case Intrinsic::aarch64_sve_lsr:
1436 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1437 .setMatchingIROpcode(Instruction::LShr);
1438 case Intrinsic::aarch64_sve_and:
1439 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1440 .setMatchingIROpcode(Instruction::And);
1441 case Intrinsic::aarch64_sve_bic:
1442 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1443 case Intrinsic::aarch64_sve_eor:
1444 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1445 .setMatchingIROpcode(Instruction::Xor);
1446 case Intrinsic::aarch64_sve_orr:
1447 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1448 .setMatchingIROpcode(Instruction::Or);
1449 case Intrinsic::aarch64_sve_sqrshl:
1450 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1451 case Intrinsic::aarch64_sve_sqshl:
1452 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1453 case Intrinsic::aarch64_sve_sqsub:
1454 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1455 case Intrinsic::aarch64_sve_srshl:
1456 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1457 case Intrinsic::aarch64_sve_uqrshl:
1458 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1459 case Intrinsic::aarch64_sve_uqshl:
1460 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1461 case Intrinsic::aarch64_sve_uqsub:
1462 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1463 case Intrinsic::aarch64_sve_urshl:
1464 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1465
1466 case Intrinsic::aarch64_sve_add_u:
1468 Instruction::Add);
1469 case Intrinsic::aarch64_sve_and_u:
1471 Instruction::And);
1472 case Intrinsic::aarch64_sve_asr_u:
1474 Instruction::AShr);
1475 case Intrinsic::aarch64_sve_eor_u:
1477 Instruction::Xor);
1478 case Intrinsic::aarch64_sve_fadd_u:
1480 Instruction::FAdd);
1481 case Intrinsic::aarch64_sve_fdiv_u:
1483 Instruction::FDiv);
1484 case Intrinsic::aarch64_sve_fmul_u:
1486 Instruction::FMul);
1487 case Intrinsic::aarch64_sve_fsub_u:
1489 Instruction::FSub);
1490 case Intrinsic::aarch64_sve_lsl_u:
1492 Instruction::Shl);
1493 case Intrinsic::aarch64_sve_lsr_u:
1495 Instruction::LShr);
1496 case Intrinsic::aarch64_sve_mul_u:
1498 Instruction::Mul);
1499 case Intrinsic::aarch64_sve_orr_u:
1501 Instruction::Or);
1502 case Intrinsic::aarch64_sve_sdiv_u:
1504 Instruction::SDiv);
1505 case Intrinsic::aarch64_sve_sub_u:
1507 Instruction::Sub);
1508 case Intrinsic::aarch64_sve_udiv_u:
1510 Instruction::UDiv);
1511
1512 case Intrinsic::aarch64_sve_addqv:
1513 case Intrinsic::aarch64_sve_and_z:
1514 case Intrinsic::aarch64_sve_bic_z:
1515 case Intrinsic::aarch64_sve_brka_z:
1516 case Intrinsic::aarch64_sve_brkb_z:
1517 case Intrinsic::aarch64_sve_brkn_z:
1518 case Intrinsic::aarch64_sve_brkpa_z:
1519 case Intrinsic::aarch64_sve_brkpb_z:
1520 case Intrinsic::aarch64_sve_cntp:
1521 case Intrinsic::aarch64_sve_compact:
1522 case Intrinsic::aarch64_sve_eor_z:
1523 case Intrinsic::aarch64_sve_eorv:
1524 case Intrinsic::aarch64_sve_eorqv:
1525 case Intrinsic::aarch64_sve_nand_z:
1526 case Intrinsic::aarch64_sve_nor_z:
1527 case Intrinsic::aarch64_sve_orn_z:
1528 case Intrinsic::aarch64_sve_orr_z:
1529 case Intrinsic::aarch64_sve_orv:
1530 case Intrinsic::aarch64_sve_orqv:
1531 case Intrinsic::aarch64_sve_pnext:
1532 case Intrinsic::aarch64_sve_rdffr_z:
1533 case Intrinsic::aarch64_sve_saddv:
1534 case Intrinsic::aarch64_sve_uaddv:
1535 case Intrinsic::aarch64_sve_umaxv:
1536 case Intrinsic::aarch64_sve_umaxqv:
1537 case Intrinsic::aarch64_sve_cmpeq:
1538 case Intrinsic::aarch64_sve_cmpeq_wide:
1539 case Intrinsic::aarch64_sve_cmpge:
1540 case Intrinsic::aarch64_sve_cmpge_wide:
1541 case Intrinsic::aarch64_sve_cmpgt:
1542 case Intrinsic::aarch64_sve_cmpgt_wide:
1543 case Intrinsic::aarch64_sve_cmphi:
1544 case Intrinsic::aarch64_sve_cmphi_wide:
1545 case Intrinsic::aarch64_sve_cmphs:
1546 case Intrinsic::aarch64_sve_cmphs_wide:
1547 case Intrinsic::aarch64_sve_cmple_wide:
1548 case Intrinsic::aarch64_sve_cmplo_wide:
1549 case Intrinsic::aarch64_sve_cmpls_wide:
1550 case Intrinsic::aarch64_sve_cmplt_wide:
1551 case Intrinsic::aarch64_sve_cmpne:
1552 case Intrinsic::aarch64_sve_cmpne_wide:
1553 case Intrinsic::aarch64_sve_facge:
1554 case Intrinsic::aarch64_sve_facgt:
1555 case Intrinsic::aarch64_sve_fcmpeq:
1556 case Intrinsic::aarch64_sve_fcmpge:
1557 case Intrinsic::aarch64_sve_fcmpgt:
1558 case Intrinsic::aarch64_sve_fcmpne:
1559 case Intrinsic::aarch64_sve_fcmpuo:
1560 case Intrinsic::aarch64_sve_ld1:
1561 case Intrinsic::aarch64_sve_ld1_gather:
1562 case Intrinsic::aarch64_sve_ld1_gather_index:
1563 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1564 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1565 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1566 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1567 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1568 case Intrinsic::aarch64_sve_ld1q_gather_index:
1569 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1570 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1571 case Intrinsic::aarch64_sve_ld1ro:
1572 case Intrinsic::aarch64_sve_ld1rq:
1573 case Intrinsic::aarch64_sve_ld1udq:
1574 case Intrinsic::aarch64_sve_ld1uwq:
1575 case Intrinsic::aarch64_sve_ld2_sret:
1576 case Intrinsic::aarch64_sve_ld2q_sret:
1577 case Intrinsic::aarch64_sve_ld3_sret:
1578 case Intrinsic::aarch64_sve_ld3q_sret:
1579 case Intrinsic::aarch64_sve_ld4_sret:
1580 case Intrinsic::aarch64_sve_ld4q_sret:
1581 case Intrinsic::aarch64_sve_ldff1:
1582 case Intrinsic::aarch64_sve_ldff1_gather:
1583 case Intrinsic::aarch64_sve_ldff1_gather_index:
1584 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1585 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1586 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1587 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1588 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1589 case Intrinsic::aarch64_sve_ldnf1:
1590 case Intrinsic::aarch64_sve_ldnt1:
1591 case Intrinsic::aarch64_sve_ldnt1_gather:
1592 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1593 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1594 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1596
1597 case Intrinsic::aarch64_sve_prf:
1598 case Intrinsic::aarch64_sve_prfb_gather_index:
1599 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1600 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1601 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1602 case Intrinsic::aarch64_sve_prfd_gather_index:
1603 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1604 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1605 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1606 case Intrinsic::aarch64_sve_prfh_gather_index:
1607 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1608 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1609 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1610 case Intrinsic::aarch64_sve_prfw_gather_index:
1611 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1612 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1613 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1615
1616 case Intrinsic::aarch64_sve_st1_scatter:
1617 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1618 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1619 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1620 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1621 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1622 case Intrinsic::aarch64_sve_st1dq:
1623 case Intrinsic::aarch64_sve_st1q_scatter_index:
1624 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1625 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1626 case Intrinsic::aarch64_sve_st1wq:
1627 case Intrinsic::aarch64_sve_stnt1:
1628 case Intrinsic::aarch64_sve_stnt1_scatter:
1629 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1630 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1631 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1633 case Intrinsic::aarch64_sve_st2:
1634 case Intrinsic::aarch64_sve_st2q:
1636 case Intrinsic::aarch64_sve_st3:
1637 case Intrinsic::aarch64_sve_st3q:
1639 case Intrinsic::aarch64_sve_st4:
1640 case Intrinsic::aarch64_sve_st4q:
1642 }
1643
1644 return SVEIntrinsicInfo();
1645}
1646
1647static bool isAllActivePredicate(Value *Pred) {
1648 Value *UncastedPred;
1649
1650 // Look through predicate casts that only remove lanes.
1652 m_Value(UncastedPred)))) {
1653 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1654 Pred = UncastedPred;
1655
1657 m_Value(UncastedPred))))
1658 // If the predicate has the same or fewer lanes than the uncasted
1659 // predicate, then we know the casting has no effect.
1660 if (OrigPredTy->getMinNumElements() <=
1661 cast<ScalableVectorType>(UncastedPred->getType())
1662 ->getMinNumElements())
1663 Pred = UncastedPred;
1664 }
1665
1666 auto *C = dyn_cast<Constant>(Pred);
1667 return C && C->isAllOnesValue();
1668}
1669
1670// Simplify `V` by only considering the operations that affect active lanes.
1671// This function should only return existing Values or newly created Constants.
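// Illustrative sketch (not from the source): given the governing predicate
// %pg, a call such as
//   %v = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(
//            <vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 7)
// can be treated as splat(i32 7) here, because only the lanes governed by
// %pg are relevant to the simplification.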
1672static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1673 auto *Dup = dyn_cast<IntrinsicInst>(V);
1674 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1675 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1677 cast<VectorType>(V->getType())->getElementCount(),
1678 cast<Constant>(Dup->getOperand(2)));
1679
1680 return V;
1681}
1682
1683static std::optional<Instruction *>
1685 const SVEIntrinsicInfo &IInfo) {
1686 const unsigned Opc = IInfo.getMatchingIROpode();
1687 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1688
1689 Value *Pg = II.getOperand(0);
1690 Value *Op1 = II.getOperand(1);
1691 Value *Op2 = II.getOperand(2);
1692 const DataLayout &DL = II.getDataLayout();
1693
1694 // Canonicalise constants to the RHS.
1696 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1697 IC.replaceOperand(II, 1, Op2);
1698 IC.replaceOperand(II, 2, Op1);
1699 return &II;
1700 }
1701
1702 // Only active lanes matter when simplifying the operation.
1703 Op1 = stripInactiveLanes(Op1, Pg);
1704 Op2 = stripInactiveLanes(Op2, Pg);
1705
1706 Value *SimpleII;
1707 if (auto FII = dyn_cast<FPMathOperator>(&II))
1708 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1709 else
1710 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1711
1712 // An SVE intrinsic's result is always defined. However, this is not the case
1713 // for its equivalent IR instruction (e.g. when shifting by an amount more
1714 // than the data's bitwidth). Simplifications to an undefined result must be
1715 // ignored to preserve the intrinsic's expected behaviour.
1716 if (!SimpleII || isa<UndefValue>(SimpleII))
1717 return std::nullopt;
1718
1719 if (IInfo.inactiveLanesAreNotDefined())
1720 return IC.replaceInstUsesWith(II, SimpleII);
1721
1722 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1723
1724 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1725 if (SimpleII == Inactive)
1726 return IC.replaceInstUsesWith(II, SimpleII);
1727
1728 // Inactive lanes must be preserved.
1729 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1730 return IC.replaceInstUsesWith(II, SimpleII);
1731}
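// Illustrative example of the binop simplification above (operands invented):
// sve.mul(%pg, %x, splat(1)) simplifies to %x. When the simplified value is
// not also the source of the intrinsic's inactive lanes, it is wrapped in
// select(%pg, <simplified>, <inactive source>) so those lanes are preserved.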
1732
1733// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1734// to operations with less strict inactive lane requirements.
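// For example (illustrative only): with an all-false governing predicate, a
// merging intrinsic folds to the operand its inactive lanes are taken from;
// with an all-true predicate, an equivalent "_u" (undef inactive lanes)
// intrinsic is used instead when one exists.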
1735static std::optional<Instruction *>
1737 const SVEIntrinsicInfo &IInfo) {
1738 if (!IInfo.hasGoverningPredicate())
1739 return std::nullopt;
1740
1741 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1742
1743 // If there are no active lanes.
1744 if (match(OpPredicate, m_ZeroInt())) {
1746 return IC.replaceInstUsesWith(
1747 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1748
1749 if (IInfo.inactiveLanesAreUnused()) {
1750 if (IInfo.resultIsZeroInitialized())
1752
1753 return IC.eraseInstFromFunction(II);
1754 }
1755 }
1756
1757 // If there are no inactive lanes.
1758 if (isAllActivePredicate(OpPredicate)) {
1759 if (IInfo.hasOperandWithNoActiveLanes()) {
1760 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1761 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1762 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1763 }
1764
1765 if (IInfo.hasMatchingUndefIntrinsic()) {
1766 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1767 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1768 II.setCalledFunction(NewDecl);
1769 return &II;
1770 }
1771 }
1772
1773 // Operation specific simplifications.
1774 if (IInfo.hasMatchingIROpode() &&
1776 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1777
1778 return std::nullopt;
1779}
1780
1781// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1782// => (binop (pred) (from_svbool _) (from_svbool _))
1783//
1784// The above transformation eliminates a `to_svbool` in the predicate
1785// operand of bitwise operation `binop` by narrowing the vector width of
1786// the operation. For example, it would convert a `<vscale x 16 x i1>
1787// and` into a `<vscale x 4 x i1> and`. This is profitable because
1788// to_svbool must zero the new lanes during widening, whereas
1789// from_svbool is free.
1790static std::optional<Instruction *>
1792 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1793 if (!BinOp)
1794 return std::nullopt;
1795
1796 auto IntrinsicID = BinOp->getIntrinsicID();
1797 switch (IntrinsicID) {
1798 case Intrinsic::aarch64_sve_and_z:
1799 case Intrinsic::aarch64_sve_bic_z:
1800 case Intrinsic::aarch64_sve_eor_z:
1801 case Intrinsic::aarch64_sve_nand_z:
1802 case Intrinsic::aarch64_sve_nor_z:
1803 case Intrinsic::aarch64_sve_orn_z:
1804 case Intrinsic::aarch64_sve_orr_z:
1805 break;
1806 default:
1807 return std::nullopt;
1808 }
1809
1810 auto BinOpPred = BinOp->getOperand(0);
1811 auto BinOpOp1 = BinOp->getOperand(1);
1812 auto BinOpOp2 = BinOp->getOperand(2);
1813
1814 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1815 if (!PredIntr ||
1816 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1817 return std::nullopt;
1818
1819 auto PredOp = PredIntr->getOperand(0);
1820 auto PredOpTy = cast<VectorType>(PredOp->getType());
1821 if (PredOpTy != II.getType())
1822 return std::nullopt;
1823
1824 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1825 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1826 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1827 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1828 if (BinOpOp1 == BinOpOp2)
1829 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1830 else
1831 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1832 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1833
1834 auto NarrowedBinOp =
1835 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1836 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1837}
1838
1839static std::optional<Instruction *>
1841 // If the reinterpret instruction operand is a PHI Node
1842 if (isa<PHINode>(II.getArgOperand(0)))
1843 return processPhiNode(IC, II);
1844
1845 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1846 return BinOpCombine;
1847
1848 // Ignore converts to/from svcount_t.
1849 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1850 isa<TargetExtType>(II.getType()))
1851 return std::nullopt;
1852
1853 SmallVector<Instruction *, 32> CandidatesForRemoval;
1854 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1855
1856 const auto *IVTy = cast<VectorType>(II.getType());
1857
1858 // Walk the chain of conversions.
1859 while (Cursor) {
1860 // If the type of the cursor has fewer lanes than the final result, zeroing
1861 // must take place, which breaks the equivalence chain.
1862 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1863 if (CursorVTy->getElementCount().getKnownMinValue() <
1864 IVTy->getElementCount().getKnownMinValue())
1865 break;
1866
1867 // If the cursor has the same type as I, it is a viable replacement.
1868 if (Cursor->getType() == IVTy)
1869 EarliestReplacement = Cursor;
1870
1871 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1872
1873 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1874 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1875 Intrinsic::aarch64_sve_convert_to_svbool ||
1876 IntrinsicCursor->getIntrinsicID() ==
1877 Intrinsic::aarch64_sve_convert_from_svbool))
1878 break;
1879
1880 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1881 Cursor = IntrinsicCursor->getOperand(0);
1882 }
1883
1884 // If no viable replacement in the conversion chain was found, there is
1885 // nothing to do.
1886 if (!EarliestReplacement)
1887 return std::nullopt;
1888
1889 return IC.replaceInstUsesWith(II, EarliestReplacement);
1890}
1891
1892static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1893 IntrinsicInst &II) {
1894 // svsel(ptrue, x, y) => x
1895 auto *OpPredicate = II.getOperand(0);
1896 if (isAllActivePredicate(OpPredicate))
1897 return IC.replaceInstUsesWith(II, II.getOperand(1));
1898
1899 auto Select =
1900 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1901 return IC.replaceInstUsesWith(II, Select);
1902}
1903
1904static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1905 IntrinsicInst &II) {
1906 Value *Pg = II.getOperand(1);
1907
1908 // sve.dup(V, all_active, X) ==> splat(X)
1909 if (isAllActivePredicate(Pg)) {
1910 auto *RetTy = cast<ScalableVectorType>(II.getType());
1911 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1912 II.getArgOperand(2));
1913 return IC.replaceInstUsesWith(II, Splat);
1914 }
1915
1917 m_SpecificInt(AArch64SVEPredPattern::vl1))))
1918 return std::nullopt;
1919
1920 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1921 Value *Insert = IC.Builder.CreateInsertElement(
1922 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
1923 return IC.replaceInstUsesWith(II, Insert);
1924}
1925
1926static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1927 IntrinsicInst &II) {
1928 // Replace DupX with a regular IR splat.
1929 auto *RetTy = cast<ScalableVectorType>(II.getType());
1930 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1931 II.getArgOperand(0));
1932 Splat->takeName(&II);
1933 return IC.replaceInstUsesWith(II, Splat);
1934}
1935
1936static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1937 IntrinsicInst &II) {
1938 LLVMContext &Ctx = II.getContext();
1939
1940 if (!isAllActivePredicate(II.getArgOperand(0)))
1941 return std::nullopt;
1942
1943 // Check that we have a compare of zero..
1944 auto *SplatValue =
1946 if (!SplatValue || !SplatValue->isZero())
1947 return std::nullopt;
1948
1949 // ..against a dupq
1950 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1951 if (!DupQLane ||
1952 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1953 return std::nullopt;
1954
1955 // Where the dupq is a lane 0 replicate of a vector insert
1956 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1957 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1958 return std::nullopt;
1959
1960 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1961 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1962 return std::nullopt;
1963
1964 // Where the vector insert is a fixed constant vector insert into undef at
1965 // index zero
1966 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1967 return std::nullopt;
1968
1969 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1970 return std::nullopt;
1971
1972 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1973 if (!ConstVec)
1974 return std::nullopt;
1975
1976 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1977 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1978 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1979 return std::nullopt;
1980
1981 unsigned NumElts = VecTy->getNumElements();
1982 unsigned PredicateBits = 0;
1983
1984 // Expand intrinsic operands to a 16-bit byte level predicate
1985 for (unsigned I = 0; I < NumElts; ++I) {
1986 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1987 if (!Arg)
1988 return std::nullopt;
1989 if (!Arg->isZero())
1990 PredicateBits |= 1 << (I * (16 / NumElts));
1991 }
1992
1993 // If all bits are zero bail early with an empty predicate
1994 if (PredicateBits == 0) {
1995 auto *PFalse = Constant::getNullValue(II.getType());
1996 PFalse->takeName(&II);
1997 return IC.replaceInstUsesWith(II, PFalse);
1998 }
1999
2000 // Calculate largest predicate type used (where byte predicate is largest)
2001 unsigned Mask = 8;
2002 for (unsigned I = 0; I < 16; ++I)
2003 if ((PredicateBits & (1 << I)) != 0)
2004 Mask |= (I % 8);
2005
2006 unsigned PredSize = Mask & -Mask;
2007 auto *PredType = ScalableVectorType::get(
2008 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2009
2010 // Ensure all relevant bits are set
2011 for (unsigned I = 0; I < 16; I += PredSize)
2012 if ((PredicateBits & (1 << I)) == 0)
2013 return std::nullopt;
2014
2015 auto *PTruePat =
2016 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2017 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2018 {PredType}, {PTruePat});
2019 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2020 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2021 auto *ConvertFromSVBool =
2022 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2023 {II.getType()}, {ConvertToSVBool});
2024
2025 ConvertFromSVBool->takeName(&II);
2026 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2027}
2028
2029static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2030 IntrinsicInst &II) {
2031 Value *Pg = II.getArgOperand(0);
2032 Value *Vec = II.getArgOperand(1);
2033 auto IntrinsicID = II.getIntrinsicID();
2034 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2035
2036 // lastX(splat(X)) --> X
2037 if (auto *SplatVal = getSplatValue(Vec))
2038 return IC.replaceInstUsesWith(II, SplatVal);
2039
2040 // If x and/or y is a splat value then:
2041 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2042 Value *LHS, *RHS;
2043 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2044 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2045 auto *OldBinOp = cast<BinaryOperator>(Vec);
2046 auto OpC = OldBinOp->getOpcode();
2047 auto *NewLHS =
2048 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2049 auto *NewRHS =
2050 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2052 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2053 return IC.replaceInstUsesWith(II, NewBinOp);
2054 }
2055 }
2056
2057 auto *C = dyn_cast<Constant>(Pg);
2058 if (IsAfter && C && C->isNullValue()) {
2059 // The intrinsic is extracting lane 0 so use an extract instead.
2060 auto *IdxTy = Type::getInt64Ty(II.getContext());
2061 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2062 Extract->insertBefore(II.getIterator());
2063 Extract->takeName(&II);
2064 return IC.replaceInstUsesWith(II, Extract);
2065 }
2066
2067 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2068 if (!IntrPG)
2069 return std::nullopt;
2070
2071 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2072 return std::nullopt;
2073
2074 const auto PTruePattern =
2075 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2076
2077 // Can the intrinsic's predicate be converted to a known constant index?
2078 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2079 if (!MinNumElts)
2080 return std::nullopt;
2081
2082 unsigned Idx = MinNumElts - 1;
2083 // Increment the index if extracting the element after the last active
2084 // predicate element.
2085 if (IsAfter)
2086 ++Idx;
2087
2088 // Ignore extracts whose index is larger than the known minimum vector
2089 // length. NOTE: This is an artificial constraint where we prefer to
2090 // maintain what the user asked for until an alternative is proven faster.
2091 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2092 if (Idx >= PgVTy->getMinNumElements())
2093 return std::nullopt;
2094
2095 // The intrinsic is extracting a fixed lane so use an extract instead.
2096 auto *IdxTy = Type::getInt64Ty(II.getContext());
2097 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2098 Extract->insertBefore(II.getIterator());
2099 Extract->takeName(&II);
2100 return IC.replaceInstUsesWith(II, Extract);
2101}
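// Illustrative example (not from the source): with a predicate of
// sve.ptrue(vl4), lastb extracts the last active lane, so
//   lastb(ptrue(vl4), %v)  -->  extractelement %v, i64 3
// and lasta would extract index 4, provided that index is within the known
// minimum vector length.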
2102
2103static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2104 IntrinsicInst &II) {
2105 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2106 // integer variant across a variety of micro-architectures. Replace scalar
2107 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2108 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2109 // depending on the micro-architecture, but has been observed as generally
2110 // being faster, particularly when the CLAST[AB] op is a loop-carried
2111 // dependency.
2112 Value *Pg = II.getArgOperand(0);
2113 Value *Fallback = II.getArgOperand(1);
2114 Value *Vec = II.getArgOperand(2);
2115 Type *Ty = II.getType();
2116
2117 if (!Ty->isIntegerTy())
2118 return std::nullopt;
2119
2120 Type *FPTy;
2121 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2122 default:
2123 return std::nullopt;
2124 case 16:
2125 FPTy = IC.Builder.getHalfTy();
2126 break;
2127 case 32:
2128 FPTy = IC.Builder.getFloatTy();
2129 break;
2130 case 64:
2131 FPTy = IC.Builder.getDoubleTy();
2132 break;
2133 }
2134
2135 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2136 auto *FPVTy = VectorType::get(
2137 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2138 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2139 auto *FPII = IC.Builder.CreateIntrinsic(
2140 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2141 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2142 return IC.replaceInstUsesWith(II, FPIItoInt);
2143}
2144
2145static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2146 IntrinsicInst &II) {
2147 LLVMContext &Ctx = II.getContext();
2148 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2149 // can work with RDFFR_PP for ptest elimination.
2150 auto *AllPat =
2151 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2152 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2153 {II.getType()}, {AllPat});
2154 auto *RDFFR =
2155 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2156 RDFFR->takeName(&II);
2157 return IC.replaceInstUsesWith(II, RDFFR);
2158}
2159
2160static std::optional<Instruction *>
2162 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2163
2164 if (Pattern == AArch64SVEPredPattern::all) {
2166 II.getType(), ElementCount::getScalable(NumElts));
2167 Cnt->takeName(&II);
2168 return IC.replaceInstUsesWith(II, Cnt);
2169 }
2170
2171 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2172
2173 return MinNumElts && NumElts >= MinNumElts
2174 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2175 II, ConstantInt::get(II.getType(), MinNumElts)))
2176 : std::nullopt;
2177}
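// Illustrative example (not from the source): sve.cntw(all) becomes a
// vscale-based element count (vscale x 4), while sve.cntw(vl2) folds to the
// constant 2 because even the minimum 128-bit vector provides 4 word lanes.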
2178
2179static std::optional<Instruction *>
2181 const AArch64Subtarget *ST) {
2182 if (!ST->isStreaming())
2183 return std::nullopt;
2184
2185 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2186 // with SVEPredPattern::all.
2187 Value *Cnt =
2189 Cnt->takeName(&II);
2190 return IC.replaceInstUsesWith(II, Cnt);
2191}
2192
2193static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2194 IntrinsicInst &II) {
2195 Value *PgVal = II.getArgOperand(0);
2196 Value *OpVal = II.getArgOperand(1);
2197
2198 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2199 // Later optimizations prefer this form.
2200 if (PgVal == OpVal &&
2201 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2202 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2203 Value *Ops[] = {PgVal, OpVal};
2204 Type *Tys[] = {PgVal->getType()};
2205
2206 auto *PTest =
2207 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2208 PTest->takeName(&II);
2209
2210 return IC.replaceInstUsesWith(II, PTest);
2211 }
2212
2215
2216 if (!Pg || !Op)
2217 return std::nullopt;
2218
2219 Intrinsic::ID OpIID = Op->getIntrinsicID();
2220
2221 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2222 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2223 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2224 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2225 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2226
2227 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2228
2229 PTest->takeName(&II);
2230 return IC.replaceInstUsesWith(II, PTest);
2231 }
2232
2233 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2234 // Later optimizations may rewrite sequence to use the flag-setting variant
2235 // of instruction X to remove PTEST.
2236 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2237 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2238 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2239 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2240 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2241 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2242 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2243 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2244 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2245 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2246 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2247 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2248 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2249 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2250 Type *Tys[] = {Pg->getType()};
2251
2252 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2253 PTest->takeName(&II);
2254
2255 return IC.replaceInstUsesWith(II, PTest);
2256 }
2257
2258 return std::nullopt;
2259}
2260
2261template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2262static std::optional<Instruction *>
2264 bool MergeIntoAddendOp) {
2265 Value *P = II.getOperand(0);
2266 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2267 if (MergeIntoAddendOp) {
2268 AddendOp = II.getOperand(1);
2269 Mul = II.getOperand(2);
2270 } else {
2271 AddendOp = II.getOperand(2);
2272 Mul = II.getOperand(1);
2273 }
2274
2276 m_Value(MulOp1))))
2277 return std::nullopt;
2278
2279 if (!Mul->hasOneUse())
2280 return std::nullopt;
2281
2282 Instruction *FMFSource = nullptr;
2283 if (II.getType()->isFPOrFPVectorTy()) {
2284 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2285 // Stop the combine when the flags on the inputs differ in case dropping
2286 // flags would lead to us missing out on more beneficial optimizations.
2287 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2288 return std::nullopt;
2289 if (!FAddFlags.allowContract())
2290 return std::nullopt;
2291 FMFSource = &II;
2292 }
2293
2294 CallInst *Res;
2295 if (MergeIntoAddendOp)
2296 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2297 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2298 else
2299 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2300 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2301
2302 return IC.replaceInstUsesWith(II, Res);
2303}
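// Illustrative use of the template above (operand names invented):
// sve.fadd(%pg, %a, sve.fmul(%pg, %b, %c)) can be fused into
// sve.fmla(%pg, %a, %b, %c) when the multiply has a single use and, for
// floating point, both calls carry matching fast-math flags that allow
// contraction.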
2304
2305static std::optional<Instruction *>
2307 Value *Pred = II.getOperand(0);
2308 Value *PtrOp = II.getOperand(1);
2309 Type *VecTy = II.getType();
2310
2311 if (isAllActivePredicate(Pred)) {
2312 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2313 Load->copyMetadata(II);
2314 return IC.replaceInstUsesWith(II, Load);
2315 }
2316
2317 CallInst *MaskedLoad =
2318 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2319 Pred, ConstantAggregateZero::get(VecTy));
2320 MaskedLoad->copyMetadata(II);
2321 return IC.replaceInstUsesWith(II, MaskedLoad);
2322}
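// Illustrative example (not from the source): sve.ld1(%pg, %ptr) with an
// all-active %pg becomes a plain vector load; otherwise it becomes an
// llvm.masked.load with a zeroinitializer passthru.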
2323
2324static std::optional<Instruction *>
2326 Value *VecOp = II.getOperand(0);
2327 Value *Pred = II.getOperand(1);
2328 Value *PtrOp = II.getOperand(2);
2329
2330 if (isAllActivePredicate(Pred)) {
2331 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2332 Store->copyMetadata(II);
2333 return IC.eraseInstFromFunction(II);
2334 }
2335
2336 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2337 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2338 MaskedStore->copyMetadata(II);
2339 return IC.eraseInstFromFunction(II);
2340}
2341
2343 switch (Intrinsic) {
2344 case Intrinsic::aarch64_sve_fmul_u:
2345 return Instruction::BinaryOps::FMul;
2346 case Intrinsic::aarch64_sve_fadd_u:
2347 return Instruction::BinaryOps::FAdd;
2348 case Intrinsic::aarch64_sve_fsub_u:
2349 return Instruction::BinaryOps::FSub;
2350 default:
2351 return Instruction::BinaryOpsEnd;
2352 }
2353}
2354
2355static std::optional<Instruction *>
2357 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2358 if (II.isStrictFP())
2359 return std::nullopt;
2360
2361 auto *OpPredicate = II.getOperand(0);
2362 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2363 if (BinOpCode == Instruction::BinaryOpsEnd ||
2364 !isAllActivePredicate(OpPredicate))
2365 return std::nullopt;
2366 auto BinOp = IC.Builder.CreateBinOpFMF(
2367 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2368 return IC.replaceInstUsesWith(II, BinOp);
2369}
2370
2371static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2372 IntrinsicInst &II) {
2373 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2374 Intrinsic::aarch64_sve_mla>(
2375 IC, II, true))
2376 return MLA;
2377 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2378 Intrinsic::aarch64_sve_mad>(
2379 IC, II, false))
2380 return MAD;
2381 return std::nullopt;
2382}
2383
2384static std::optional<Instruction *>
2386 if (auto FMLA =
2387 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2388 Intrinsic::aarch64_sve_fmla>(IC, II,
2389 true))
2390 return FMLA;
2391 if (auto FMAD =
2392 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2393 Intrinsic::aarch64_sve_fmad>(IC, II,
2394 false))
2395 return FMAD;
2396 if (auto FMLA =
2397 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2398 Intrinsic::aarch64_sve_fmla>(IC, II,
2399 true))
2400 return FMLA;
2401 return std::nullopt;
2402}
2403
2404static std::optional<Instruction *>
2406 if (auto FMLA =
2407 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2408 Intrinsic::aarch64_sve_fmla>(IC, II,
2409 true))
2410 return FMLA;
2411 if (auto FMAD =
2412 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2413 Intrinsic::aarch64_sve_fmad>(IC, II,
2414 false))
2415 return FMAD;
2416 if (auto FMLA_U =
2417 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2418 Intrinsic::aarch64_sve_fmla_u>(
2419 IC, II, true))
2420 return FMLA_U;
2421 return instCombineSVEVectorBinOp(IC, II);
2422}
2423
2424static std::optional<Instruction *>
2426 if (auto FMLS =
2427 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2428 Intrinsic::aarch64_sve_fmls>(IC, II,
2429 true))
2430 return FMLS;
2431 if (auto FMSB =
2432 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2433 Intrinsic::aarch64_sve_fnmsb>(
2434 IC, II, false))
2435 return FMSB;
2436 if (auto FMLS =
2437 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2438 Intrinsic::aarch64_sve_fmls>(IC, II,
2439 true))
2440 return FMLS;
2441 return std::nullopt;
2442}
2443
2444static std::optional<Instruction *>
2446 if (auto FMLS =
2447 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2448 Intrinsic::aarch64_sve_fmls>(IC, II,
2449 true))
2450 return FMLS;
2451 if (auto FMSB =
2452 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2453 Intrinsic::aarch64_sve_fnmsb>(
2454 IC, II, false))
2455 return FMSB;
2456 if (auto FMLS_U =
2457 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2458 Intrinsic::aarch64_sve_fmls_u>(
2459 IC, II, true))
2460 return FMLS_U;
2461 return instCombineSVEVectorBinOp(IC, II);
2462}
2463
2464static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2465 IntrinsicInst &II) {
2466 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2467 Intrinsic::aarch64_sve_mls>(
2468 IC, II, true))
2469 return MLS;
2470 return std::nullopt;
2471}
2472
2473static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2474 IntrinsicInst &II) {
2475 Value *UnpackArg = II.getArgOperand(0);
2476 auto *RetTy = cast<ScalableVectorType>(II.getType());
2477 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2478 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2479
2480 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2481 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2482 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2483 ScalarArg =
2484 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2485 Value *NewVal =
2486 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2487 NewVal->takeName(&II);
2488 return IC.replaceInstUsesWith(II, NewVal);
2489 }
2490
2491 return std::nullopt;
2492}
2493static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2494 IntrinsicInst &II) {
2495 auto *OpVal = II.getOperand(0);
2496 auto *OpIndices = II.getOperand(1);
2497 VectorType *VTy = cast<VectorType>(II.getType());
2498
2499 // Check whether OpIndices is a constant splat value < minimal element count
2500 // of result.
2501 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2502 if (!SplatValue ||
2503 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2504 return std::nullopt;
2505
2506 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
2507 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2508 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2509 auto *VectorSplat =
2510 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2511
2512 VectorSplat->takeName(&II);
2513 return IC.replaceInstUsesWith(II, VectorSplat);
2514}
2515
2516static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2517 IntrinsicInst &II) {
2518 Value *A, *B;
2519 Type *RetTy = II.getType();
2520 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2521 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2522
2523 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2524 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2525 if ((match(II.getArgOperand(0),
2527 match(II.getArgOperand(1),
2529 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2530 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2531 auto *TyA = cast<ScalableVectorType>(A->getType());
2532 if (TyA == B->getType() &&
2534 auto *SubVec = IC.Builder.CreateInsertVector(
2535 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2536 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2537 TyA->getMinNumElements());
2538 ConcatVec->takeName(&II);
2539 return IC.replaceInstUsesWith(II, ConcatVec);
2540 }
2541 }
2542
2543 return std::nullopt;
2544}
2545
2546static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2547 IntrinsicInst &II) {
2548 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2549 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2550 Value *A, *B;
2551 if (match(II.getArgOperand(0),
2554 m_Specific(A), m_Specific(B))))
2555 return IC.replaceInstUsesWith(
2556 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2557
2558 return std::nullopt;
2559}
2560
2561static std::optional<Instruction *>
2563 Value *Mask = II.getOperand(0);
2564 Value *BasePtr = II.getOperand(1);
2565 Value *Index = II.getOperand(2);
2566 Type *Ty = II.getType();
2567 Value *PassThru = ConstantAggregateZero::get(Ty);
2568
2569 // Contiguous gather => masked load.
2570 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2571 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2572 Value *IndexBase;
2574 m_Value(IndexBase), m_SpecificInt(1)))) {
2575 Align Alignment =
2576 BasePtr->getPointerAlignment(II.getDataLayout());
2577
2578 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2579 BasePtr, IndexBase);
2580 CallInst *MaskedLoad =
2581 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2582 MaskedLoad->takeName(&II);
2583 return IC.replaceInstUsesWith(II, MaskedLoad);
2584 }
2585
2586 return std::nullopt;
2587}
2588
2589static std::optional<Instruction *>
2591 Value *Val = II.getOperand(0);
2592 Value *Mask = II.getOperand(1);
2593 Value *BasePtr = II.getOperand(2);
2594 Value *Index = II.getOperand(3);
2595 Type *Ty = Val->getType();
2596
2597 // Contiguous scatter => masked store.
2598 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2599 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2600 Value *IndexBase;
2602 m_Value(IndexBase), m_SpecificInt(1)))) {
2603 Align Alignment =
2604 BasePtr->getPointerAlignment(II.getDataLayout());
2605
2606 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2607 BasePtr, IndexBase);
2608 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2609
2610 return IC.eraseInstFromFunction(II);
2611 }
2612
2613 return std::nullopt;
2614}
2615
2616static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2617 IntrinsicInst &II) {
2619 Value *Pred = II.getOperand(0);
2620 Value *Vec = II.getOperand(1);
2621 Value *DivVec = II.getOperand(2);
2622
2623 Value *SplatValue = getSplatValue(DivVec);
2624 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2625 if (!SplatConstantInt)
2626 return std::nullopt;
2627
2628 APInt Divisor = SplatConstantInt->getValue();
2629 const int64_t DivisorValue = Divisor.getSExtValue();
2630 if (DivisorValue == -1)
2631 return std::nullopt;
2632 if (DivisorValue == 1)
2633 IC.replaceInstUsesWith(II, Vec);
2634
2635 if (Divisor.isPowerOf2()) {
2636 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2637 auto ASRD = IC.Builder.CreateIntrinsic(
2638 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2639 return IC.replaceInstUsesWith(II, ASRD);
2640 }
2641 if (Divisor.isNegatedPowerOf2()) {
2642 Divisor.negate();
2643 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2644 auto ASRD = IC.Builder.CreateIntrinsic(
2645 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2646 auto NEG = IC.Builder.CreateIntrinsic(
2647 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2648 return IC.replaceInstUsesWith(II, NEG);
2649 }
2650
2651 return std::nullopt;
2652}
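// Illustrative example (values invented): sve.sdiv(%pg, %x, splat(8)) becomes
// sve.asrd(%pg, %x, 3), and a divisor of splat(-8) becomes the same asrd
// followed by a predicated negate.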
2653
2654bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2655 size_t VecSize = Vec.size();
2656 if (VecSize == 1)
2657 return true;
2658 if (!isPowerOf2_64(VecSize))
2659 return false;
2660 size_t HalfVecSize = VecSize / 2;
2661
2662 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2663 RHS != Vec.end(); LHS++, RHS++) {
2664 if (*LHS != nullptr && *RHS != nullptr) {
2665 if (*LHS == *RHS)
2666 continue;
2667 else
2668 return false;
2669 }
2670 if (!AllowPoison)
2671 return false;
2672 if (*LHS == nullptr && *RHS != nullptr)
2673 *LHS = *RHS;
2674 }
2675
2676 Vec.resize(HalfVecSize);
2677 SimplifyValuePattern(Vec, AllowPoison);
2678 return true;
2679}
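// Illustrative example (not from the source): the sequence (A, B, A, B)
// simplifies to (A, B); with AllowPoison, (A, nullptr, A, B) also simplifies
// to (A, B) because the missing element (a nullptr entry) is taken from the
// matching lane in the other half.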
2680
2681// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2682// to dupqlane(f64(C)) where C is A concatenated with B
2683static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2684 IntrinsicInst &II) {
2685 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2686 if (!match(II.getOperand(0),
2688 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2689 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2690 return std::nullopt;
2691 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2692
2693 // Insert the scalars into a container ordered by InsertElement index
2694 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2695 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2696 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2697 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2698 CurrentInsertElt = InsertElt->getOperand(0);
2699 }
2700
2701 bool AllowPoison =
2702 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2703 if (!SimplifyValuePattern(Elts, AllowPoison))
2704 return std::nullopt;
2705
2706 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2707 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2708 for (size_t I = 0; I < Elts.size(); I++) {
2709 if (Elts[I] == nullptr)
2710 continue;
2711 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2712 IC.Builder.getInt64(I));
2713 }
2714 if (InsertEltChain == nullptr)
2715 return std::nullopt;
2716
2717 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2718 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2719 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2720 // be narrowed back to the original type.
2721 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2722 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2723 IIScalableTy->getMinNumElements() /
2724 PatternWidth;
2725
2726 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2727 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2728 auto *WideShuffleMaskTy =
2729 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2730
2731 auto InsertSubvector = IC.Builder.CreateInsertVector(
2732 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2733 uint64_t(0));
2734 auto WideBitcast =
2735 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2736 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2737 auto WideShuffle = IC.Builder.CreateShuffleVector(
2738 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2739 auto NarrowBitcast =
2740 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2741
2742 return IC.replaceInstUsesWith(II, NarrowBitcast);
2743}
2744
2745static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2746 IntrinsicInst &II) {
2747 Value *A = II.getArgOperand(0);
2748 Value *B = II.getArgOperand(1);
2749 if (A == B)
2750 return IC.replaceInstUsesWith(II, A);
2751
2752 return std::nullopt;
2753}
2754
2755static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2756 IntrinsicInst &II) {
2757 Value *Pred = II.getOperand(0);
2758 Value *Vec = II.getOperand(1);
2759 Value *Shift = II.getOperand(2);
2760
2761 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2762 Value *AbsPred, *MergedValue;
2764 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2766 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2767
2768 return std::nullopt;
2769
2770 // The transform is valid if any of the following are true:
2771 // * The ABS merge value is undef or non-negative
2772 // * The ABS predicate is all active
2773 // * The ABS predicate and the SRSHL predicates are the same
2774 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2775 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2776 return std::nullopt;
2777
2778 // Only valid when the shift amount is non-negative, otherwise the rounding
2779 // behaviour of SRSHL cannot be ignored.
2780 if (!match(Shift, m_NonNegative()))
2781 return std::nullopt;
2782
2783 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2784 {II.getType()}, {Pred, Vec, Shift});
2785
2786 return IC.replaceInstUsesWith(II, LSL);
2787}
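// Illustrative example (not from the source): with
//   %abs = sve.abs(undef, %pg, %x)
// the call sve.srshl(%pg, %abs, splat(2)) can be rewritten as
// sve.lsl(%pg, %abs, splat(2)), since a non-negative shift amount makes the
// rounding behaviour of SRSHL irrelevant.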
2788
2789static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2790 IntrinsicInst &II) {
2791 Value *Vec = II.getOperand(0);
2792
2793 if (getSplatValue(Vec) == II.getOperand(1))
2794 return IC.replaceInstUsesWith(II, Vec);
2795
2796 return std::nullopt;
2797}
2798
2799static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2800 IntrinsicInst &II) {
2801 // If this barrier is post-dominated by an identical one, we can remove it.
2802 auto *NI = II.getNextNode();
2803 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2804 auto CanSkipOver = [](Instruction *I) {
2805 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2806 };
2807 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2808 auto *NIBB = NI->getParent();
2809 NI = NI->getNextNode();
2810 if (!NI) {
2811 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2812 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2813 else
2814 break;
2815 }
2816 }
2817 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2818 if (NextII && II.isIdenticalTo(NextII))
2819 return IC.eraseInstFromFunction(II);
2820
2821 return std::nullopt;
2822}
2823
2824static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2825 IntrinsicInst &II) {
2826 return IC.replaceInstUsesWith(
2827 II,
2828 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2829 {II.getType(), II.getOperand(0)->getType()},
2830 {II.getOperand(0), II.getOperand(1)}));
2831}
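// Illustrative example (not from the source): sve.whilelo(i64 %i, i64 %n) is
// rewritten to the target-independent llvm.get.active.lane.mask intrinsic
// with the same operands and result type.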
2832
2833static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2834 IntrinsicInst &II) {
2836 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2837 return std::nullopt;
2838}
2839
2840static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2842 unsigned NumBits) {
2843 Value *Passthru = II.getOperand(0);
2844 Value *Pg = II.getOperand(1);
2845 Value *Op = II.getOperand(2);
2846
2847 // Convert UXT[BHW] to AND.
2848 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2849 auto *Ty = cast<VectorType>(II.getType());
2850 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2851 auto *Mask = ConstantInt::get(Ty, MaskValue);
2852 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2853 {Pg, Op, Mask});
2854 return IC.replaceInstUsesWith(II, And);
2855 }
2856
2857 return std::nullopt;
2858}
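// Illustrative example (not from the source): sve.uxtb(undef, %pg, %x) on i32
// elements becomes sve.and_u(%pg, %x, splat(0xff)), i.e. the zero-extend from
// the low byte is expressed as a predicated AND with a constant mask.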
2859
2860static std::optional<Instruction *>
2862 SMEAttrs FnSMEAttrs(*II.getFunction());
2863 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2864 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2865 return IC.replaceInstUsesWith(
2866 II, ConstantInt::getBool(II.getType(), IsStreaming));
2867 return std::nullopt;
2868}
2869
2870std::optional<Instruction *>
2872 IntrinsicInst &II) const {
2874 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2875 return I;
2876
2877 Intrinsic::ID IID = II.getIntrinsicID();
2878 switch (IID) {
2879 default:
2880 break;
2881 case Intrinsic::aarch64_dmb:
2882 return instCombineDMB(IC, II);
2883 case Intrinsic::aarch64_neon_fmaxnm:
2884 case Intrinsic::aarch64_neon_fminnm:
2885 return instCombineMaxMinNM(IC, II);
2886 case Intrinsic::aarch64_sve_convert_from_svbool:
2887 return instCombineConvertFromSVBool(IC, II);
2888 case Intrinsic::aarch64_sve_dup:
2889 return instCombineSVEDup(IC, II);
2890 case Intrinsic::aarch64_sve_dup_x:
2891 return instCombineSVEDupX(IC, II);
2892 case Intrinsic::aarch64_sve_cmpne:
2893 case Intrinsic::aarch64_sve_cmpne_wide:
2894 return instCombineSVECmpNE(IC, II);
2895 case Intrinsic::aarch64_sve_rdffr:
2896 return instCombineRDFFR(IC, II);
2897 case Intrinsic::aarch64_sve_lasta:
2898 case Intrinsic::aarch64_sve_lastb:
2899 return instCombineSVELast(IC, II);
2900 case Intrinsic::aarch64_sve_clasta_n:
2901 case Intrinsic::aarch64_sve_clastb_n:
2902 return instCombineSVECondLast(IC, II);
2903 case Intrinsic::aarch64_sve_cntd:
2904 return instCombineSVECntElts(IC, II, 2);
2905 case Intrinsic::aarch64_sve_cntw:
2906 return instCombineSVECntElts(IC, II, 4);
2907 case Intrinsic::aarch64_sve_cnth:
2908 return instCombineSVECntElts(IC, II, 8);
2909 case Intrinsic::aarch64_sve_cntb:
2910 return instCombineSVECntElts(IC, II, 16);
2911 case Intrinsic::aarch64_sme_cntsd:
2912 return instCombineSMECntsd(IC, II, ST);
2913 case Intrinsic::aarch64_sve_ptest_any:
2914 case Intrinsic::aarch64_sve_ptest_first:
2915 case Intrinsic::aarch64_sve_ptest_last:
2916 return instCombineSVEPTest(IC, II);
2917 case Intrinsic::aarch64_sve_fadd:
2918 return instCombineSVEVectorFAdd(IC, II);
2919 case Intrinsic::aarch64_sve_fadd_u:
2920 return instCombineSVEVectorFAddU(IC, II);
2921 case Intrinsic::aarch64_sve_fmul_u:
2922 return instCombineSVEVectorBinOp(IC, II);
2923 case Intrinsic::aarch64_sve_fsub:
2924 return instCombineSVEVectorFSub(IC, II);
2925 case Intrinsic::aarch64_sve_fsub_u:
2926 return instCombineSVEVectorFSubU(IC, II);
2927 case Intrinsic::aarch64_sve_add:
2928 return instCombineSVEVectorAdd(IC, II);
2929 case Intrinsic::aarch64_sve_add_u:
2930 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2931 Intrinsic::aarch64_sve_mla_u>(
2932 IC, II, true);
2933 case Intrinsic::aarch64_sve_sub:
2934 return instCombineSVEVectorSub(IC, II);
2935 case Intrinsic::aarch64_sve_sub_u:
2936 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2937 Intrinsic::aarch64_sve_mls_u>(
2938 IC, II, true);
2939 case Intrinsic::aarch64_sve_tbl:
2940 return instCombineSVETBL(IC, II);
2941 case Intrinsic::aarch64_sve_uunpkhi:
2942 case Intrinsic::aarch64_sve_uunpklo:
2943 case Intrinsic::aarch64_sve_sunpkhi:
2944 case Intrinsic::aarch64_sve_sunpklo:
2945 return instCombineSVEUnpack(IC, II);
2946 case Intrinsic::aarch64_sve_uzp1:
2947 return instCombineSVEUzp1(IC, II);
2948 case Intrinsic::aarch64_sve_zip1:
2949 case Intrinsic::aarch64_sve_zip2:
2950 return instCombineSVEZip(IC, II);
2951 case Intrinsic::aarch64_sve_ld1_gather_index:
2952 return instCombineLD1GatherIndex(IC, II);
2953 case Intrinsic::aarch64_sve_st1_scatter_index:
2954 return instCombineST1ScatterIndex(IC, II);
2955 case Intrinsic::aarch64_sve_ld1:
2956 return instCombineSVELD1(IC, II, DL);
2957 case Intrinsic::aarch64_sve_st1:
2958 return instCombineSVEST1(IC, II, DL);
2959 case Intrinsic::aarch64_sve_sdiv:
2960 return instCombineSVESDIV(IC, II);
2961 case Intrinsic::aarch64_sve_sel:
2962 return instCombineSVESel(IC, II);
2963 case Intrinsic::aarch64_sve_srshl:
2964 return instCombineSVESrshl(IC, II);
2965 case Intrinsic::aarch64_sve_dupq_lane:
2966 return instCombineSVEDupqLane(IC, II);
2967 case Intrinsic::aarch64_sve_insr:
2968 return instCombineSVEInsr(IC, II);
2969 case Intrinsic::aarch64_sve_whilelo:
2970 return instCombineWhilelo(IC, II);
2971 case Intrinsic::aarch64_sve_ptrue:
2972 return instCombinePTrue(IC, II);
2973 case Intrinsic::aarch64_sve_uxtb:
2974 return instCombineSVEUxt(IC, II, 8);
2975 case Intrinsic::aarch64_sve_uxth:
2976 return instCombineSVEUxt(IC, II, 16);
2977 case Intrinsic::aarch64_sve_uxtw:
2978 return instCombineSVEUxt(IC, II, 32);
2979 case Intrinsic::aarch64_sme_in_streaming_mode:
2980 return instCombineInStreamingMode(IC, II);
2981 }
2982
2983 return std::nullopt;
2984}
2985
2987 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2988 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2989 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2990 SimplifyAndSetOp) const {
2991 switch (II.getIntrinsicID()) {
2992 default:
2993 break;
2994 case Intrinsic::aarch64_neon_fcvtxn:
2995 case Intrinsic::aarch64_neon_rshrn:
2996 case Intrinsic::aarch64_neon_sqrshrn:
2997 case Intrinsic::aarch64_neon_sqrshrun:
2998 case Intrinsic::aarch64_neon_sqshrn:
2999 case Intrinsic::aarch64_neon_sqshrun:
3000 case Intrinsic::aarch64_neon_sqxtn:
3001 case Intrinsic::aarch64_neon_sqxtun:
3002 case Intrinsic::aarch64_neon_uqrshrn:
3003 case Intrinsic::aarch64_neon_uqshrn:
3004 case Intrinsic::aarch64_neon_uqxtn:
3005 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3006 break;
3007 }
3008
3009 return std::nullopt;
3010}
3011
3013 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3015}
3016
3019 switch (K) {
3021 return TypeSize::getFixed(64);
3023 if (ST->useSVEForFixedLengthVectors() &&
3024 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3025 return TypeSize::getFixed(
3026 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3027 else if (ST->isNeonAvailable())
3028 return TypeSize::getFixed(128);
3029 else
3030 return TypeSize::getFixed(0);
3032 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3034 return TypeSize::getScalable(128);
3035 else
3036 return TypeSize::getScalable(0);
3037 }
3038 llvm_unreachable("Unsupported register kind");
3039}
3040
3041bool AArch64TTIImpl::isSingleExtWideningInstruction(
3042 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3043 Type *SrcOverrideTy) const {
3044 // A helper that returns a vector type with the scalar type of ArgTy and
3045 // the element count of DstTy.
3046 auto toVectorTy = [&](Type *ArgTy) {
3047 return VectorType::get(ArgTy->getScalarType(),
3048 cast<VectorType>(DstTy)->getElementCount());
3049 };
3050
3051 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3052 // i32, i64]. SVE doesn't generally have the same set of instructions to
3053 // perform an extend with the add/sub/mul. There are SMULLB style
3054 // instructions, but they operate on top/bottom, requiring some sort of lane
3055 // interleaving to be used with zext/sext.
3056 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3057 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3058 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3059 return false;
3060
3061 Type *SrcTy = SrcOverrideTy;
3062 switch (Opcode) {
3063 case Instruction::Add: // UADDW(2), SADDW(2).
3064 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3065 // The second operand needs to be an extend
3066 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3067 if (!SrcTy)
3068 SrcTy =
3069 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3070 break;
3071 }
3072
3073 if (Opcode == Instruction::Sub)
3074 return false;
3075
3076 // UADDW(2), SADDW(2) can be commuted.
3077 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3078 if (!SrcTy)
3079 SrcTy =
3080 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3081 break;
3082 }
3083 return false;
3084 }
3085 default:
3086 return false;
3087 }
3088
3089 // Legalize the destination type and ensure it can be used in a widening
3090 // operation.
3091 auto DstTyL = getTypeLegalizationCost(DstTy);
3092 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3093 return false;
3094
3095 // Legalize the source type and ensure it can be used in a widening
3096 // operation.
3097 assert(SrcTy && "Expected some SrcTy");
3098 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3099 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3100 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3101 return false;
3102
3103 // Get the total number of vector elements in the legalized types.
3104 InstructionCost NumDstEls =
3105 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3106 InstructionCost NumSrcEls =
3107 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3108
3109 // Return true if the legalized types have the same number of vector elements
3110 // and the destination element type size is twice that of the source type.
3111 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3112}
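// Illustrative example (types invented): an add such as
//   add <8 x i16> %a, (zext <8 x i8> %b to <8 x i16>)
// maps onto UADDW, making the zero-extend of the second operand effectively
// free; for sub only the second (extended) operand form is handled
// (USUBW/SSUBW).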
3113
3114Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3116 Type *SrcOverrideTy) const {
3117 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3118 Opcode != Instruction::Mul)
3119 return nullptr;
3120
3121 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3122 // i32, i64]. SVE doesn't generally have the same set of instructions to
3123 // perform an extend with the add/sub/mul. There are SMULLB style
3124 // instructions, but they operate on top/bottom, requiring some sort of lane
3125 // interleaving to be used with zext/sext.
3126 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3127 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3128 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3129 return nullptr;
3130
3131 auto getScalarSizeWithOverride = [&](const Value *V) {
3132 if (SrcOverrideTy)
3133 return SrcOverrideTy->getScalarSizeInBits();
3134 return cast<Instruction>(V)
3135 ->getOperand(0)
3136 ->getType()
3137 ->getScalarSizeInBits();
3138 };
3139
3140 unsigned MaxEltSize = 0;
3141 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3142 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3143 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3144 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3145 MaxEltSize = std::max(EltSize0, EltSize1);
3146 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3147 isa<SExtInst, ZExtInst>(Args[1])) {
3148 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3149 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3150 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3151 // enough.
3152 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3153 return nullptr;
3154 MaxEltSize = DstEltSize / 2;
3155 } else if (Opcode == Instruction::Mul &&
3156 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3157 // If one of the operands is a Zext and the other has enough zero bits
3158 // to be treated as unsigned, we can still generate a umull, meaning the
3159 // zext is free.
3160 KnownBits Known =
3161 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3162 if (Args[0]->getType()->getScalarSizeInBits() -
3163 Known.Zero.countLeadingOnes() >
3164 DstTy->getScalarSizeInBits() / 2)
3165 return nullptr;
3166
3167 MaxEltSize =
3168 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3169 } else
3170 return nullptr;
3171
3172 if (MaxEltSize * 2 > DstEltSize)
3173 return nullptr;
3174
3175 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3176 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3177 return nullptr;
3178 return ExtTy;
3179}
3180
3181// s/urhadd instructions implement the following pattern, making the
3182// extends free:
3183// %x = add ((zext i8 -> i16), 1)
3184// %y = (zext i8 -> i16)
3185// trunc i16 (lshr (add %x, %y), 1) -> i8
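// e.g. with zext <16 x i8> inputs the sequence above lowers to a single
// urhadd (srhadd when the extends are sexts), so the extends themselves
// should not be costed separately.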
3186//
3187bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3188 Type *Src) const {
3189 // The source should be a legal vector type.
3190 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3191 (Src->isScalableTy() && !ST->hasSVE2()))
3192 return false;
3193
3194 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3195 return false;
3196
3197 // Look for trunc/shl/add before trying to match the pattern.
3198 const Instruction *Add = ExtUser;
3199 auto *AddUser =
3200 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3201 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3202 Add = AddUser;
3203
3204 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3205 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3206 return false;
3207
3208 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3209 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3210 Src->getScalarSizeInBits() !=
3211 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3212 return false;
3213
3214 // Try to match the whole pattern. Ext could be either the first or second
3215 // m_ZExtOrSExt matched.
3216 Instruction *Ex1, *Ex2;
3217 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3218 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3219 return false;
3220
3221 // Ensure both extends are of the same type
3222 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3223 Ex1->getOpcode() == Ex2->getOpcode())
3224 return true;
3225
3226 return false;
3227}
3228
3229InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3230 Type *Src,
3231 TTI::CastContextHint CCH,
3232 TTI::TargetCostKind CostKind,
3233 const Instruction *I) const {
3234 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3235 assert(ISD && "Invalid opcode");
3236 // If the cast is observable, and it is used by a widening instruction (e.g.,
3237 // uaddl, saddw, etc.), it may be free.
3238 if (I && I->hasOneUser()) {
3239 auto *SingleUser = cast<Instruction>(*I->user_begin());
3240 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3241 if (Type *ExtTy = isBinExtWideningInstruction(
3242 SingleUser->getOpcode(), Dst, Operands,
3243 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3244 // The cost from Src->Src*2 needs to be added if required, the cost from
3245 // Src*2->ExtTy is free.
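 // e.g. if one operand is extended from i8 and the other from i32 into an
 // i64 vector, ExtTy has i64 elements, so the i8 operand is charged an
 // i8 -> i16 extend here and the rest of the widening is treated as free.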
3246 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3247 Type *DoubleSrcTy =
3248 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3249 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3250 TTI::CastContextHint::None, CostKind);
3251 }
3252
3253 return 0;
3254 }
3255
3256 if (isSingleExtWideningInstruction(
3257 SingleUser->getOpcode(), Dst, Operands,
3258 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3259 // For adds, only count the second operand as free if both operands are
3260 // extends but not the same operation (i.e. both operands are not free in
3261 // add(sext, zext)).
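 // e.g. both extends in add(zext, zext) are treated as free (uaddl), but in
 // add(sext, zext) only the extend feeding the second operand is free.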
3262 if (SingleUser->getOpcode() == Instruction::Add) {
3263 if (I == SingleUser->getOperand(1) ||
3264 (isa<CastInst>(SingleUser->getOperand(1)) &&
3265 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3266 return 0;
3267 } else {
3268 // Others are free so long as isSingleExtWideningInstruction
3269 // returned true.
3270 return 0;
3271 }
3272 }
3273
3274 // The cast will be free for the s/urhadd instructions
3275 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3276 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3277 return 0;
3278 }
3279
3280 // TODO: Allow non-throughput costs that aren't binary.
3281 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3282 if (CostKind != TTI::TCK_RecipThroughput)
3283 return Cost == 0 ? 0 : 1;
3284 return Cost;
3285 };
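 // i.e. for non-throughput cost kinds any non-free conversion from the tables
 // below is approximated as a single instruction.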
3286
3287 EVT SrcTy = TLI->getValueType(DL, Src);
3288 EVT DstTy = TLI->getValueType(DL, Dst);
3289
3290 if (!SrcTy.isSimple() || !DstTy.isSimple())
3291 return AdjustCost(
3292 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3293
3294 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3295 // we use fcvtx under SVE2. Give them invalid costs.
3296 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3297 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3298 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3299 return InstructionCost::getInvalid();
3300
3301 static const TypeConversionCostTblEntry BF16Tbl[] = {
3302 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3303 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3304 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3305 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3306 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3307 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3308 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3309 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3310 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3311 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3312 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3313 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3314 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3315 };
3316
3317 if (ST->hasBF16())
3318 if (const auto *Entry = ConvertCostTableLookup(
3319 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3320 return AdjustCost(Entry->Cost);
3321
3322 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3323 // The cost of unpacking twice is artificially increased for now in order
3324 // to avoid regressions against NEON, which will use tbl instructions directly
3325 // instead of multiple layers of [s|u]unpk[lo|hi].
3326 // We use the unpacks in cases where the destination type is illegal and
3327 // requires splitting of the input, even if the input type itself is legal.
3328 const unsigned int SVE_EXT_COST = 1;
3329 const unsigned int SVE_FCVT_COST = 1;
3330 const unsigned int SVE_UNPACK_ONCE = 4;
3331 const unsigned int SVE_UNPACK_TWICE = 16;
3332
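 // For example, the nxv8f32 <- nxv8i8 entries below work out as
 // SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 1 + 4 + 2 = 7.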
3333 static const TypeConversionCostTblEntry ConversionTbl[] = {
3334 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3335 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3336 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3337 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3338 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3339 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3340 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3341 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3342 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3343 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3344 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3345 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3346 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3347 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3348 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3349 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3350 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3351 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3352 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3353 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3354
3355 // Truncations on nxvmiN
3356 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3357 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3358 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3359 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3360 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3361 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3362 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3363 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3364 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3365 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3366 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3367 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3368 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3369 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3370 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3371 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3372 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3373 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3374 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3375 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3376 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3377 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3378 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3379 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3380 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3381 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3382 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3383 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3384 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3385 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3386 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3387 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3388 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3389
3390 // The number of shll instructions for the extension.
3391 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3392 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3393 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3394 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3395 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3396 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3397 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3398 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3399 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3400 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3401 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3402 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3403 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3404 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3405 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3406 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3407
3408 // FP Ext and trunc
3409 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3410 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3411 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3412 // FP16
3413 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3414 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3415 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3416 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3417 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3418 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3419 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3420 // BF16 (uses shift)
3421 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3422 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3423 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3424 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3425 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3426 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3427 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3428 // FP Ext and trunc
3429 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3430 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3431 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3432 // FP16
3433 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3434 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3435 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3436 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3437 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3438 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3439 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3440 // BF16 (more complex; the +bf16 case is handled above)
3441 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3442 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3443 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3444 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3445 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3446 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3447 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3448 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3449
3450 // LowerVectorINT_TO_FP:
3451 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3452 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3453 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3454 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3455 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3456 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3457
3458 // SVE: to nxv2f16
3459 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3460 SVE_EXT_COST + SVE_FCVT_COST},
3461 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3462 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3463 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3464 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3465 SVE_EXT_COST + SVE_FCVT_COST},
3466 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3467 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3468 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3469
3470 // SVE: to nxv4f16
3471 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3472 SVE_EXT_COST + SVE_FCVT_COST},
3473 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3474 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3475 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3476 SVE_EXT_COST + SVE_FCVT_COST},
3477 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3478 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3479
3480 // SVE: to nxv8f16
3481 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3482 SVE_EXT_COST + SVE_FCVT_COST},
3483 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3484 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3485 SVE_EXT_COST + SVE_FCVT_COST},
3486 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3487
3488 // SVE: to nxv16f16
3489 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3490 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3491 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3492 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3493
3494 // Complex: to v2f32
3495 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3496 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3497 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3498 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3499
3500 // SVE: to nxv2f32
3501 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3502 SVE_EXT_COST + SVE_FCVT_COST},
3503 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3504 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3505 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3506 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3507 SVE_EXT_COST + SVE_FCVT_COST},
3508 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3509 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3510 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3511
3512 // Complex: to v4f32
3513 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3514 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3515 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3516 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3517
3518 // SVE: to nxv4f32
3519 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3520 SVE_EXT_COST + SVE_FCVT_COST},
3521 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3522 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3523 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3524 SVE_EXT_COST + SVE_FCVT_COST},
3525 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3526 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3527
3528 // Complex: to v8f32
3529 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3530 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3531 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3532 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3533
3534 // SVE: to nxv8f32
3535 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3536 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3537 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3538 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3539 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3540 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3541 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3542 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3543
3544 // SVE: to nxv16f32
3545 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3546 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3547 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3548 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3549
3550 // Complex: to v16f32
3551 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3552 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3553
3554 // Complex: to v2f64
3555 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3556 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3557 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3558 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3559 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3560 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3561
3562 // SVE: to nxv2f64
3563 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3564 SVE_EXT_COST + SVE_FCVT_COST},
3565 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3566 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3567 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3568 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3569 SVE_EXT_COST + SVE_FCVT_COST},
3570 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3571 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3572 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3573
3574 // Complex: to v4f64
3575 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3576 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3577
3578 // SVE: to nxv4f64
3579 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3580 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3581 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3582 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3583 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3584 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3585 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3586 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3587 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3588 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3589 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3590 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3591
3592 // SVE: to nxv8f64
3593 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3594 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3595 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3596 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3597 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3598 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3599 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3600 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3601
3602 // LowerVectorFP_TO_INT
3603 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3604 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3605 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3606 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3607 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3608 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3609
3610 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3611 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3612 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3613 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3614 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3615 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3616 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3617
3618 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3619 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3620 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3621 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3622 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3623
3624 // Complex, from nxv2f32.
3625 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3626 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3627 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3628 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3629 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3630 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3631 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3632 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3633
3634 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3635 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3636 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3637 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3638 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3639 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3640 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3641
3642 // Complex, from nxv2f64.
3643 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3644 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3645 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3646 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3647 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3648 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3649 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3650 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3651 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3652 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3653
3654 // Complex, from nxv4f32.
3655 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3656 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3657 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3658 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3659 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3660 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3661 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3662 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3663 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3664 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3665
3666 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3667 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3668 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3669 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3670 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3671
3672 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3673 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3674 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3675 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3676 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3677 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3678 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3679
3680 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3681 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3682 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3683 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3684 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3685
3686 // Complex, from nxv8f16.
3687 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3688 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3689 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3690 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3691 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3692 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3693 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3694 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3695 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3696 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3697
3698 // Complex, from nxv4f16.
3699 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3700 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3701 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3702 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3703 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3704 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3705 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3706 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3707
3708 // Complex, from nxv2f16.
3709 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3710 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3711 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3712 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3713 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3714 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3715 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3716 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3717
3718 // Truncate from nxvmf32 to nxvmf16.
3719 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3720 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3721 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3722
3723 // Truncate from nxvmf32 to nxvmbf16.
3724 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3725 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3726 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3727
3728 // Truncate from nxvmf64 to nxvmf16.
3729 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3730 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3731 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3732
3733 // Truncate from nxvmf64 to nxvmbf16.
3734 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3735 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3736 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3737
3738 // Truncate from nxvmf64 to nxvmf32.
3739 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3740 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3741 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3742
3743 // Extend from nxvmf16 to nxvmf32.
3744 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3745 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3746 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3747
3748 // Extend from nxvmbf16 to nxvmf32.
3749 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3750 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3751 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3752
3753 // Extend from nxvmf16 to nxvmf64.
3754 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3755 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3756 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3757
3758 // Extend from nxvmbf16 to nxvmf64.
3759 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3760 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3761 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3762
3763 // Extend from nxvmf32 to nxvmf64.
3764 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3765 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3766 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3767
3768 // Bitcasts from float to integer
3769 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3770 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3771 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3772
3773 // Bitcasts from integer to float
3774 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3775 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3776 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3777
3778 // Add cost for extending to illegal (too wide) scalable vectors.
3779 // zero/sign extends are implemented by multiple unpack operations,
3780 // where each operation has a cost of 1.
3781 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3782 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3783 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3784 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3785 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3786 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3787
3788 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3789 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3790 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3791 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3792 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3793 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3794 };
3795
3796 // We have to estimate the cost of a fixed-length operation performed on
3797 // SVE registers, scaled by the number of registers required for the
3798 // fixed type to be represented on SVE registers.
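 // e.g. when SVE is used for fixed-length vectors, a v8i64 <- v8i16 sext is
 // costed as LT.first copies of an nxv2i64 <- nxv2i16 extend, since
 // SVEBitsPerBlock / 64 == 2 elements per register.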
3799 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3800 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3801 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3802 ST->useSVEForFixedLengthVectors(WiderTy)) {
3803 std::pair<InstructionCost, MVT> LT =
3804 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3805 unsigned NumElements =
3806 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3807 return AdjustCost(
3808 LT.first *
3809 getCastInstrCost(
3810 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3811 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3812 CostKind, I));
3813 }
3814
3815 if (const auto *Entry = ConvertCostTableLookup(
3816 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3817 return AdjustCost(Entry->Cost);
3818
3819 static const TypeConversionCostTblEntry FP16Tbl[] = {
3820 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3821 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3822 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3823 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3824 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3825 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3826 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3827 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3828 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3829 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3830 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3831 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3832 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3833 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3834 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3835 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3836 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3837 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3838 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3839 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3840 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3841 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3842 };
3843
3844 if (ST->hasFullFP16())
3845 if (const auto *Entry = ConvertCostTableLookup(
3846 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3847 return AdjustCost(Entry->Cost);
3848
3849 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3850 // double-rounding issues.
3851 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3852 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3854 return AdjustCost(
3856 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3857 CCH, CostKind) +
3859 CostKind) +
3861 CostKind));
3862
3863 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3865 ST->isSVEorStreamingSVEAvailable() &&
3866 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3867 TargetLowering::TypePromoteInteger &&
3868 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3869 TargetLowering::TypeSplitVector) {
3870 // The standard behaviour in the backend for these cases is to split the
3871 // extend up into two parts:
3872 // 1. Perform an extending load or masked load up to the legal type.
3873 // 2. Extend the loaded data to the final type.
3874 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3875 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3876 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3877 Opcode, LegalTy, Src, CCH, CostKind, I);
3878 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3879 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3880 return Part1 + Part2;
3881 }
3882
3883 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3884 // but we also want to include the TTI::CastContextHint::Masked case too.
3885 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3886 CCH == TTI::CastContextHint::Masked &&
3887 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3888 CCH = TTI::CastContextHint::Normal;
3889
3890 return AdjustCost(
3891 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3892}
3893
3894InstructionCost
3895AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3896 VectorType *VecTy, unsigned Index,
3897 TTI::TargetCostKind CostKind) const {
3898
3899 // Make sure we were given a valid extend opcode.
3900 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3901 "Invalid opcode");
3902
3903 // We are extending an element we extract from a vector, so the source type
3904 // of the extend is the element type of the vector.
3905 auto *Src = VecTy->getElementType();
3906
3907 // Sign- and zero-extends are for integer types only.
3908 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3909
3910 // Get the cost for the extract. We compute the cost (if any) for the extend
3911 // below.
3912 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3913 CostKind, Index, nullptr, nullptr);
3914
3915 // Legalize the types.
3916 auto VecLT = getTypeLegalizationCost(VecTy);
3917 auto DstVT = TLI->getValueType(DL, Dst);
3918 auto SrcVT = TLI->getValueType(DL, Src);
3919
3920 // If the resulting type is still a vector and the destination type is legal,
3921 // we may get the extension for free. If not, get the default cost for the
3922 // extend.
3923 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3924 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3925 CostKind);
3926
3927 // The destination type should be larger than the element type. If not, get
3928 // the default cost for the extend.
3929 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3930 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3931 CostKind);
3932
3933 switch (Opcode) {
3934 default:
3935 llvm_unreachable("Opcode should be either SExt or ZExt");
3936
3937 // For sign-extends, we only need a smov, which performs the extension
3938 // automatically.
3939 case Instruction::SExt:
3940 return Cost;
3941
3942 // For zero-extends, the extend is performed automatically by a umov unless
3943 // the destination type is i64 and the element type is i8 or i16.
3944 case Instruction::ZExt:
3945 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3946 return Cost;
3947 }
3948
3949 // If we are unable to perform the extend for free, get the default cost.
3950 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3951 CostKind);
3952}
3953
3954InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3955 TTI::TargetCostKind CostKind,
3956 const Instruction *I) const {
3957 if (CostKind != TTI::TCK_RecipThroughput)
3958 return Opcode == Instruction::PHI ? 0 : 1;
3959 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3960 // Branches are assumed to be predicted.
3961 return 0;
3962}
3963
3964InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3965 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3966 const Instruction *I, Value *Scalar,
3967 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3968 assert(Val->isVectorTy() && "This must be a vector type");
3969
3970 if (Index != -1U) {
3971 // Legalize the type.
3972 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3973
3974 // This type is legalized to a scalar type.
3975 if (!LT.second.isVector())
3976 return 0;
3977
3978 // The type may be split. For fixed-width vectors we can normalize the
3979 // index to the new type.
3980 if (LT.second.isFixedLengthVector()) {
3981 unsigned Width = LT.second.getVectorNumElements();
3982 Index = Index % Width;
3983 }
3984
3985 // The element at index zero is already inside the vector.
3986 // - For an insert-element or extract-element
3987 // instruction that extracts integers, an explicit FPR -> GPR move is
3988 // needed, so it has a non-zero cost.
3989 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3990 return 0;
3991
3992 // This recognises an LD1 (single-element structure to one lane of one
3993 // register) instruction. I.e., if this is an `insertelement` instruction
3994 // and its second operand is a load, then we will generate an LD1, which
3995 // is an expensive instruction.
3996 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3997 return CostKind == TTI::TCK_CodeSize
3998 ? 0
3999 : ST->getVectorInsertExtractBaseCost();
4000
4001 // i1 inserts and extracts will include an extra cset or cmp of the vector
4002 // value. Increase the cost by 1 to account for this.
4003 if (Val->getScalarSizeInBits() == 1)
4004 return CostKind == TTI::TCK_CodeSize
4005 ? 2
4006 : ST->getVectorInsertExtractBaseCost() + 1;
4007
4008 // FIXME:
4009 // If the extract-element and insert-element instructions could be
4010 // simplified away (e.g., could be combined into users by looking at use-def
4011 // context), they have no cost. This is not done in the first place for
4012 // compile-time considerations.
4013 }
4014
4015 // In the case of NEON, if there exists an extractelement from lane != 0 such that
4016 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4017 // 2. extractelement result feeds into fmul.
4018 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4019 // equivalent to 0.
4020 // then the extractelement can be merged with fmul in the backend and it
4021 // incurs no cost.
4022 // e.g.
4023 // define double @foo(<2 x double> %a) {
4024 // %1 = extractelement <2 x double> %a, i32 0
4025 // %2 = extractelement <2 x double> %a, i32 1
4026 // %res = fmul double %1, %2
4027 // ret double %res
4028 // }
4029 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4030 auto ExtractCanFuseWithFmul = [&]() {
4031 // We bail out if the extract is from lane 0.
4032 if (Index == 0)
4033 return false;
4034
4035 // Check if the scalar element type of the vector operand of ExtractElement
4036 // instruction is one of the allowed types.
4037 auto IsAllowedScalarTy = [&](const Type *T) {
4038 return T->isFloatTy() || T->isDoubleTy() ||
4039 (T->isHalfTy() && ST->hasFullFP16());
4040 };
4041
4042 // Check if the extractelement user is scalar fmul.
4043 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4044 // Check if the user is scalar fmul.
4045 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4046 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4047 !BO->getType()->isVectorTy();
4048 };
4049
4050 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4051 // certain scalar type and a certain vector register width.
4052 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4053 auto RegWidth =
4054 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4055 .getFixedValue();
4056 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4057 };
4058
4059 // Check if the type constraints on input vector type and result scalar type
4060 // of extractelement instruction are satisfied.
4061 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4062 return false;
4063
4064 if (Scalar) {
4065 DenseMap<User *, unsigned> UserToExtractIdx;
4066 for (auto *U : Scalar->users()) {
4067 if (!IsUserFMulScalarTy(U))
4068 return false;
4069 // Recording an entry for the user is what matters here; the index
4070 // value itself is not important.
4071 UserToExtractIdx[U];
4072 }
4073 if (UserToExtractIdx.empty())
4074 return false;
4075 for (auto &[S, U, L] : ScalarUserAndIdx) {
4076 for (auto *U : S->users()) {
4077 if (UserToExtractIdx.contains(U)) {
4078 auto *FMul = cast<BinaryOperator>(U);
4079 auto *Op0 = FMul->getOperand(0);
4080 auto *Op1 = FMul->getOperand(1);
4081 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4082 UserToExtractIdx[U] = L;
4083 break;
4084 }
4085 }
4086 }
4087 }
4088 for (auto &[U, L] : UserToExtractIdx) {
4089 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4090 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4091 return false;
4092 }
4093 } else {
4094 const auto *EE = cast<ExtractElementInst>(I);
4095
4096 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4097 if (!IdxOp)
4098 return false;
4099
4100 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4101 if (!IsUserFMulScalarTy(U))
4102 return false;
4103
4104 // Check if the other operand of extractelement is also extractelement
4105 // from lane equivalent to 0.
4106 const auto *BO = cast<BinaryOperator>(U);
4107 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4108 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4109 if (OtherEE) {
4110 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4111 if (!IdxOp)
4112 return false;
4113 return IsExtractLaneEquivalentToZero(
4114 cast<ConstantInt>(OtherEE->getIndexOperand())
4115 ->getValue()
4116 .getZExtValue(),
4117 OtherEE->getType()->getScalarSizeInBits());
4118 }
4119 return true;
4120 });
4121 }
4122 return true;
4123 };
4124
4125 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4126 ExtractCanFuseWithFmul())
4127 return 0;
4128
4129 // All other insert/extracts cost this much.
4130 return CostKind == TTI::TCK_CodeSize ? 1
4131 : ST->getVectorInsertExtractBaseCost();
4132}
4133
4134InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4135 TTI::TargetCostKind CostKind,
4136 unsigned Index,
4137 const Value *Op0,
4138 const Value *Op1) const {
4139 // Treat an insert at lane 0 into a poison vector as having zero cost. This
4140 // ensures vector broadcasts via an insert + shuffle (which will be lowered to
4141 // a single dup) are treated as cheap.
4142 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4143 isa<PoisonValue>(Op0))
4144 return 0;
4145 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4146}
4147
4148InstructionCost AArch64TTIImpl::getVectorInstrCost(
4149 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4150 Value *Scalar,
4151 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4152 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4153 ScalarUserAndIdx);
4154}
4155
4156InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4157 Type *Val,
4158 TTI::TargetCostKind CostKind,
4159 unsigned Index) const {
4160 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4161}
4162
4166 unsigned Index) const {
4167 if (isa<FixedVectorType>(Val))
4169 Index);
4170
4171 // This typically requires both while and lastb instructions in order
4172 // to extract the last element. If this is in a loop the while
4173 // instruction can at least be hoisted out, although it will consume a
4174 // predicate register. The cost should be more expensive than the base
4175 // extract cost, which is 2 for most CPUs.
4176 return CostKind == TTI::TCK_CodeSize
4177 ? 2
4178 : ST->getVectorInsertExtractBaseCost() + 1;
4179}
4180
4181InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4182 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4183 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4184 ArrayRef<Value *> VL) const {
4185 if (isa<ScalableVectorType>(Ty))
4186 return InstructionCost::getInvalid();
4187 if (Ty->getElementType()->isFloatingPointTy())
4188 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4189 CostKind);
4190 unsigned VecInstCost =
4191 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
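 // e.g. 4 demanded lanes with Insert only and a base cost of 2 gives
 // 4 * (1 + 0) * 2 = 8.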
4192 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4193}
4194
4195std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4196 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4197 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4198 std::function<InstructionCost(Type *)> InstCost) const {
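 // e.g. an fadd on <4 x half> without +fullfp16 is costed as two fpexts to
 // <4 x float> (one if either operand is constant), the float operation, and
 // an fptrunc back when IncludeTrunc is set.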
4199 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4200 return std::nullopt;
4201 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4202 return std::nullopt;
4203 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4204 ST->isNonStreamingSVEorSME2Available())
4205 return std::nullopt;
4206
4207 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4208 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4209 TTI::CastContextHint::None, CostKind);
4210 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4211 Cost *= 2;
4212 Cost += InstCost(PromotedTy);
4213 if (IncludeTrunc)
4214 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4215 TTI::CastContextHint::None, CostKind);
4216 return Cost;
4217}
4218
4219InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4220 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4221 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4222 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4223
4224 // The code-generator is currently not able to handle scalable vectors
4225 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4226 // it. This change will be removed when code-generation for these types is
4227 // sufficiently reliable.
4228 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4229 if (VTy->getElementCount() == ElementCount::getScalable(1))
4230 return InstructionCost::getInvalid();
4231
4232 // TODO: Handle more cost kinds.
4233 if (CostKind != TTI::TCK_RecipThroughput)
4234 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4235 Op2Info, Args, CxtI);
4236
4237 // Legalize the type.
4238 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4239 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4240
4241 // Increase the cost for half and bfloat types if not architecturally
4242 // supported.
4243 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4244 ISD == ISD::FDIV || ISD == ISD::FREM)
4245 if (auto PromotedCost = getFP16BF16PromoteCost(
4246 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4247 // There is not native support for fdiv/frem even with +sve-b16b16.
4248 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4249 [&](Type *PromotedTy) {
4250 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4251 Op1Info, Op2Info);
4252 }))
4253 return *PromotedCost;
4254
4255 // If the operation is a widening instruction (smull or umull) and both
4256 // operands are extends the cost can be cheaper by considering that the
4257 // operation will operate on the narrowest type size possible (double the
4258 // largest input size) and a further extend.
4259 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4260 if (ExtTy != Ty)
4261 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4262 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4263 TTI::CastContextHint::None, CostKind);
4264 return LT.first;
4265 }
4266
4267 switch (ISD) {
4268 default:
4269 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4270 Op2Info);
4271 case ISD::SREM:
4272 case ISD::SDIV:
4273 /*
4274 Notes for sdiv/srem specific costs:
4275 1. This only considers the cases where the divisor is constant, uniform and
4276 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4277 result in some form of (ldr + adrp), corresponding to constant vectors, or
4278 scalarization of the division operation.
4279 2. Constant divisors, either negative in whole or partially, don't result in
4280 significantly different codegen as compared to positive constant divisors.
4281 So, we don't consider negative divisors separately.
4282 3. If the codegen is significantly different with SVE, it has been indicated
4283 using comments at appropriate places.
4284
4285 sdiv specific cases:
4286 -----------------------------------------------------------------------
4287 codegen | pow-of-2 | Type
4288 -----------------------------------------------------------------------
4289 add + cmp + csel + asr | Y | i64
4290 add + cmp + csel + asr | Y | i32
4291 -----------------------------------------------------------------------
4292
4293 srem specific cases:
4294 -----------------------------------------------------------------------
4295 codegen | pow-of-2 | Type
4296 -----------------------------------------------------------------------
4297 negs + and + and + csneg | Y | i64
4298 negs + and + and + csneg | Y | i32
4299 -----------------------------------------------------------------------
4300
4301 other sdiv/srem cases:
4302 -------------------------------------------------------------------------
4303 common codegen | + srem | + sdiv | pow-of-2 | Type
4304 -------------------------------------------------------------------------
4305 smulh + asr + add + add | - | - | N | i64
4306 smull + lsr + add + add | - | - | N | i32
4307 usra | and + sub | sshr | Y | <2 x i64>
4308 2 * (scalar code) | - | - | N | <2 x i64>
4309 usra | bic + sub | sshr + neg | Y | <4 x i32>
4310 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4311 + sshr + usra | | | |
4312 -------------------------------------------------------------------------
4313 */
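 // For example, per the code below a uniform power-of-2 scalar sdiv is costed
 // as 3 * AddCost + AsrCost (add + cmp + csel + asr), and the matching srem
 // as 3 * AsrCost + AddCost (negs + and + and + csneg).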
4314 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4315 InstructionCost AddCost =
4316 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4317 Op1Info.getNoProps(), Op2Info.getNoProps());
4318 InstructionCost AsrCost =
4319 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4320 Op1Info.getNoProps(), Op2Info.getNoProps());
4321 InstructionCost MulCost =
4322 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4323 Op1Info.getNoProps(), Op2Info.getNoProps());
4324 // add/cmp/csel/csneg should have similar costs, as should
4325 // asr/negs/and.
4326 auto VT = TLI->getValueType(DL, Ty);
4327 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4328 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4329 // Neg can be folded into the asr instruction.
4330 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4331 : (3 * AsrCost + AddCost);
4332 } else {
4333 return MulCost + AsrCost + 2 * AddCost;
4334 }
4335 } else if (VT.isVector()) {
4336 InstructionCost UsraCost = 2 * AsrCost;
4337 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4338 // Division with scalable types corresponds to native 'asrd'
4339 // instruction when SVE is available.
4340 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4341
4342 // One more for the negation in SDIV
4343 InstructionCost Cost =
4344 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4345 if (Ty->isScalableTy() && ST->hasSVE())
4346 Cost += 2 * AsrCost;
4347 else {
4348 Cost +=
4349 UsraCost +
4350 (ISD == ISD::SDIV
4351 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4352 : 2 * AddCost);
4353 }
4354 return Cost;
4355 } else if (LT.second == MVT::v2i64) {
4356 return VT.getVectorNumElements() *
4357 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4358 Op1Info.getNoProps(),
4359 Op2Info.getNoProps());
4360 } else {
4361 // When SVE is available, we get:
4362 // smulh + lsr + add/sub + asr + add/sub.
4363 if (Ty->isScalableTy() && ST->hasSVE())
4364 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4365 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4366 }
4367 }
4368 }
4369 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4370 LT.second.isFixedLengthVector()) {
4371 // FIXME: When the constant vector is non-uniform, this may result in
4372 // loading the vector from constant pool or in some cases, may also result
4373 // in scalarization. For now, we are approximating this with the
4374 // scalarization cost.
4375 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4376 CostKind, -1, nullptr, nullptr);
4377 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4378 CostKind, -1, nullptr, nullptr);
4379 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4380 return ExtractCost + InsertCost +
4381 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4382 CostKind, Op1Info.getNoProps(),
4383 Op2Info.getNoProps());
4384 }
4385 [[fallthrough]];
4386 case ISD::UDIV:
4387 case ISD::UREM: {
4388 auto VT = TLI->getValueType(DL, Ty);
4389 if (Op2Info.isConstant()) {
4390 // If the operand is a power of 2 we can use the shift or and cost.
4391 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4392 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4393 Op1Info.getNoProps(),
4394 Op2Info.getNoProps());
4395 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4396 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4397 Op1Info.getNoProps(),
4398 Op2Info.getNoProps());
4399
4400 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4401 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4402 // The MULHU will be expanded to UMULL for the types not listed below,
4403 // and will become a pair of UMULL+UMULL2 for 128-bit vectors.
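 // e.g. for a 128-bit NEON vector udiv by a non-power-of-2 constant this
 // works out below as 2 * MulCost + 2 * AddCost + 2 * ShrCost.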
4404 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4405 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4406 LT.second == MVT::nxv16i8;
4407 bool Is128bit = LT.second.is128BitVector();
4408
4409 InstructionCost MulCost =
4410 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4411 Op1Info.getNoProps(), Op2Info.getNoProps());
4412 InstructionCost AddCost =
4413 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4414 Op1Info.getNoProps(), Op2Info.getNoProps());
4415 InstructionCost ShrCost =
4416 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4417 Op1Info.getNoProps(), Op2Info.getNoProps());
4418 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4419 (HasMULH ? 0 : ShrCost) + // UMULL shift
4420 AddCost * 2 + ShrCost;
4421 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4422 }
4423 }
4424
4425 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4426 // emitted by the backend even when those functions are not declared in the
4427 // module.
4428 if (!VT.isVector() && VT.getSizeInBits() > 64)
4429 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4430
4431 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4432 Opcode, Ty, CostKind, Op1Info, Op2Info);
4433 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4434 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4435 // SDIV/UDIV operations are lowered using SVE, so the cost can be
4436 // lower.
4437 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4438 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4439 static const CostTblEntry DivTbl[]{
4440 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4441 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4442 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4443 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4444 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4445 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4446
4447 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4448 if (nullptr != Entry)
4449 return Entry->Cost;
4450 }
4451 // For 8/16-bit elements, the cost is higher because the type
4452 // requires promotion and possibly splitting:
4453 if (LT.second.getScalarType() == MVT::i8)
4454 Cost *= 8;
4455 else if (LT.second.getScalarType() == MVT::i16)
4456 Cost *= 4;
4457 return Cost;
4458 } else {
4459 // If one of the operands is a uniform constant then the cost for each
4460 // element is the cost of insertion, extraction and division:
4461 // insertion cost = 2, extraction cost = 2, division = cost of the
4462 // operation on the scalar type.
4463 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4464 (Op2Info.isConstant() && Op2Info.isUniform())) {
4465 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4466 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4467 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4468 return (4 + DivCost) * VTy->getNumElements();
4469 }
4470 }
4471 // On AArch64, without SVE, vector divisions are expanded
4472 // into scalar divisions of each pair of elements.
4473 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4474 -1, nullptr, nullptr);
4475 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4476 nullptr, nullptr);
4477 }
4478
4479 // TODO: if one of the arguments is scalar, then it's not necessary to
4480 // double the cost of handling the vector elements.
4481 Cost += Cost;
4482 }
4483 return Cost;
4484 }
4485 case ISD::MUL:
4486 // When SVE is available, we can lower the v2i64 operation using
4487 // the SVE mul instruction, which has a lower cost.
4488 if (LT.second == MVT::v2i64 && ST->hasSVE())
4489 return LT.first;
4490
4491 // When SVE is not available, there is no MUL.2d instruction,
4492 // which means mul <2 x i64> is expensive as elements are extracted
4493 // from the vectors and the muls scalarized.
4494 // As getScalarizationOverhead is a bit too pessimistic, we
4495 // estimate the cost for an i64 vector directly here, which is:
4496 // - four 2-cost i64 extracts,
4497 // - two 2-cost i64 inserts, and
4498 // - two 1-cost muls.
4499 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4500 // LT.first = 2 the cost is 28.
4501 if (LT.second != MVT::v2i64)
4502 return LT.first;
4503 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4504 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4505 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4506 nullptr, nullptr) *
4507 2 +
4508 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4509 nullptr, nullptr));
4510 case ISD::ADD:
4511 case ISD::XOR:
4512 case ISD::OR:
4513 case ISD::AND:
4514 case ISD::SRL:
4515 case ISD::SRA:
4516 case ISD::SHL:
4517 // These nodes are marked as 'custom' for combining purposes only.
4518 // We know that they are legal. See LowerAdd in ISelLowering.
4519 return LT.first;
4520
4521 case ISD::FNEG:
4522 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4523 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4524 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4525 CxtI &&
4526 ((CxtI->hasOneUse() &&
4527 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4528 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4529 return 0;
4530 [[fallthrough]];
4531 case ISD::FADD:
4532 case ISD::FSUB:
4533 if (!Ty->getScalarType()->isFP128Ty())
4534 return LT.first;
4535 [[fallthrough]];
4536 case ISD::FMUL:
4537 case ISD::FDIV:
4538 // These nodes are marked as 'custom' just to lower them to SVE.
4539 // We know said lowering will incur no additional cost.
4540 if (!Ty->getScalarType()->isFP128Ty())
4541 return 2 * LT.first;
4542
4543 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4544 Op2Info);
4545 case ISD::FREM:
4546 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4547 // those functions are not declared in the module.
4548 if (!Ty->isVectorTy())
4549 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4550 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4551 Op2Info);
4552 }
4553}
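// Illustrative sketch (not from this file): the two scalarization formulas
// described in the comments above, with the assumed per-element costs spelled
// out (insert = 2, extract = 2, scalar mul = 1, as stated above). The helper
// names are invented for the example.

// Uniform-constant divide: insertion (2) + extraction (2) + one scalar
// division per element, i.e. (4 + DivCost) * NumElts.
constexpr int uniformConstDivCost(int NumElts, int ScalarDivCost) {
  return (4 + ScalarDivCost) * NumElts;
}
static_assert(uniformConstDivCost(4, 1) == 20, "4 lanes, scalar div cost 1");

// Scalarized i64 multiply: per element, two 2-cost extracts, one 2-cost
// insert and one 1-cost mul.
constexpr int scalarizedI64MulCost(int NumElts) {
  constexpr int ExtractCost = 2, InsertCost = 2, MulCost = 1;
  return NumElts * (2 * ExtractCost + InsertCost + MulCost);
}
static_assert(scalarizedI64MulCost(2) == 14, "v2i64 with LT.first == 1");
static_assert(scalarizedI64MulCost(4) == 28, "v4i64 with LT.first == 2");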
4554
4555 InstructionCost
4556 AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4557 const SCEV *Ptr,
4558 TTI::TargetCostKind CostKind) const {
4559 // Address computations in vectorized code with non-consecutive addresses will
4560 // likely result in more instructions compared to scalar code where the
4561 // computation can more often be merged into the index mode. The resulting
4562 // extra micro-ops can significantly decrease throughput.
4563 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4564 int MaxMergeDistance = 64;
4565
4566 if (PtrTy->isVectorTy() && SE &&
4567 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4568 return NumVectorInstToHideOverhead;
4569
4570 // In many cases the address computation is not merged into the instruction
4571 // addressing mode.
4572 return 1;
4573}
4574
4575 /// Check whether Opcode1 has lower throughput than Opcode2 according to the
4576 /// scheduling model.
4577 bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4578 unsigned Opcode1, unsigned Opcode2) const {
4579 const MCSchedModel &Sched = ST->getSchedModel();
4580 const TargetInstrInfo *TII = ST->getInstrInfo();
4581 if (!Sched.hasInstrSchedModel())
4582 return false;
4583
4584 const MCSchedClassDesc *SCD1 =
4585 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4586 const MCSchedClassDesc *SCD2 =
4587 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4588 // We cannot handle variant scheduling classes without an MI. If we need to
4589 // support them for any of the instructions whose information we query, we
4590 // might need to add a way to resolve them without an MI, or stop using the
4591 // scheduling info.
4592 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4593 "Cannot handle variant scheduling classes without an MI");
4594 if (!SCD1->isValid() || !SCD2->isValid())
4595 return false;
4596
4597 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4598 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4599}
4600
4601 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4602 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4603 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4604 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4605 // We don't lower some vector selects that are wider than the register width
4606 // well. TODO: Improve this with different cost kinds.
4607 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4608 // We would need this many instructions to hide the scalarization happening.
4609 const int AmortizationCost = 20;
4610
4611 // If VecPred is not set, check if we can get a predicate from the context
4612 // instruction, if its type matches the requested ValTy.
4613 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4614 CmpPredicate CurrentPred;
4615 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4616 m_Value())))
4617 VecPred = CurrentPred;
4618 }
4619 // Check if we have a compare/select chain that can be lowered using
4620 // a (F)CMxx & BFI pair.
4621 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4622 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4623 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4624 VecPred == CmpInst::FCMP_UNE) {
4625 static const auto ValidMinMaxTys = {
4626 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4627 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4628 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4629
4630 auto LT = getTypeLegalizationCost(ValTy);
4631 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4632 (ST->hasFullFP16() &&
4633 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4634 return LT.first;
4635 }
4636
4637 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4638 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4639 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4640 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4641 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4642 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4643 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4644 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4645 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4646 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4647 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4648 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4649
4650 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4651 EVT SelValTy = TLI->getValueType(DL, ValTy);
4652 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4653 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4654 SelCondTy.getSimpleVT(),
4655 SelValTy.getSimpleVT()))
4656 return Entry->Cost;
4657 }
4658 }
4659
4660 if (Opcode == Instruction::FCmp) {
4661 if (auto PromotedCost = getFP16BF16PromoteCost(
4662 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4663 // TODO: Consider costing SVE FCMPs.
4664 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4666 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4667 CostKind, Op1Info, Op2Info);
4668 if (isa<VectorType>(PromotedTy))
4670 Instruction::Trunc,
4674 return Cost;
4675 }))
4676 return *PromotedCost;
4677
4678 auto LT = getTypeLegalizationCost(ValTy);
4679 // Model unknown fp compares as a libcall.
4680 if (LT.second.getScalarType() != MVT::f64 &&
4681 LT.second.getScalarType() != MVT::f32 &&
4682 LT.second.getScalarType() != MVT::f16)
4683 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4684 {ValTy, ValTy}, CostKind);
4685
4686 // Some comparison operators require expanding to multiple compares + or.
4687 unsigned Factor = 1;
4688 if (!CondTy->isVectorTy() &&
4689 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4690 Factor = 2; // fcmp with 2 selects
4691 else if (isa<FixedVectorType>(ValTy) &&
4692 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4693 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4694 Factor = 3; // fcmxx+fcmyy+or
4695 else if (isa<ScalableVectorType>(ValTy) &&
4696 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4697 Factor = 3; // fcmxx+fcmyy+or
4698
4699 if (isa<ScalableVectorType>(ValTy) &&
4701 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4702 AArch64::FCMEQv4f32))
4703 Factor *= 2;
4704
4705 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4706 }
4707
4708 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be folded
4709 // to icmp(and, 0), as free, since we can make use of ands, but only if the
4710 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4711 // provided it will not cause performance regressions.
4712 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4713 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4714 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4715 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4716 if (match(I->getOperand(1), m_Zero()))
4717 return 0;
4718
4719 // x >= 1 / x < 1 -> x > 0 / x <= 0
4720 if (match(I->getOperand(1), m_One()) &&
4721 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4722 return 0;
4723
4724 // x <= -1 / x > -1 -> x > 0 / x <= 0
4725 if (match(I->getOperand(1), m_AllOnes()) &&
4726 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4727 return 0;
4728 }
4729
4730 // The base case handles scalable vectors fine for now, since it treats the
4731 // cost as 1 * legalization cost.
4732 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4733 Op1Info, Op2Info, I);
4734}
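// Illustrative sketch (not from this file): how the fcmp expansion factor
// above scales with legalization for a reciprocal-throughput query. The
// values mirror the comments: an ordered-not-equal (ONE) compare on a fixed
// vector expands to two compares plus an orr, and <8 x float> legalizes into
// two v4f32 halves.
constexpr int Factor = 3;   // fcmxx + fcmyy + orr
constexpr int LTFirst = 2;  // <8 x float> -> two v4f32 operations
static_assert(Factor * LTFirst == 6, "three instructions for each half");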
4735
4736 TTI::MemCmpExpansionOptions
4737 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4738 TTI::MemCmpExpansionOptions Options;
4739 if (ST->requiresStrictAlign()) {
4740 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4741 // a bunch of instructions when strict align is enabled.
4742 return Options;
4743 }
4744 Options.AllowOverlappingLoads = true;
4745 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4746 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4747 // TODO: Though vector loads usually perform well on AArch64, on some targets
4748 // they may wake up the FP unit, which raises the power consumption. Perhaps
4749 // they could be used with no holds barred (-O3).
4750 Options.LoadSizes = {8, 4, 2, 1};
4751 Options.AllowedTailExpansions = {3, 5, 6};
4752 return Options;
4753}
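// Illustrative sketch (not from this file, and not the actual ExpandMemCmp
// algorithm): the effect of AllowOverlappingLoads with the 8/4/2/1 load sizes
// above. Once overlap is allowed, any compare of 8 bytes or more can be
// covered by ceil(N / 8) eight-byte loads per operand, with the last load
// shifted back so it ends at byte N.
constexpr int ceilDiv(int A, int B) { return (A + B - 1) / B; }
static_assert(ceilDiv(15, 8) == 2, "15 bytes: two overlapping 8-byte loads");
static_assert(ceilDiv(16, 8) == 2, "16 bytes: two 8-byte loads, no overlap");
static_assert(ceilDiv(24, 8) == 3, "24 bytes: three 8-byte loads");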
4754
4755 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4756 return ST->hasSVE();
4757}
4758
4762 switch (MICA.getID()) {
4763 case Intrinsic::masked_scatter:
4764 case Intrinsic::masked_gather:
4765 return getGatherScatterOpCost(MICA, CostKind);
4766 case Intrinsic::masked_load:
4767 case Intrinsic::masked_store:
4768 return getMaskedMemoryOpCost(MICA, CostKind);
4769 }
4771}
4772
4776 Type *Src = MICA.getDataType();
4777
4778 if (useNeonVector(Src))
4780 auto LT = getTypeLegalizationCost(Src);
4781 if (!LT.first.isValid())
4782 return InstructionCost::getInvalid();
4783
4784 // Return an invalid cost for element types that we are unable to lower.
4785 auto *VT = cast<VectorType>(Src);
4786 if (VT->getElementType()->isIntegerTy(1))
4787 return InstructionCost::getInvalid();
4788
4789 // The code-generator is currently not able to handle scalable vectors
4790 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4791 // it. This change will be removed when code-generation for these types is
4792 // sufficiently reliable.
4793 if (VT->getElementCount() == ElementCount::getScalable(1))
4794 return InstructionCost::getInvalid();
4795
4796 return LT.first;
4797}
4798
4799 // This function returns the gather/scatter overhead, either from the
4800 // user-provided value or from the per-target specialized value in \p ST.
4801static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4802 const AArch64Subtarget *ST) {
4803 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4804 "Should be called on only load or stores.");
4805 switch (Opcode) {
4806 case Instruction::Load:
4807 if (SVEGatherOverhead.getNumOccurrences() > 0)
4808 return SVEGatherOverhead;
4809 return ST->getGatherOverhead();
4810 break;
4811 case Instruction::Store:
4812 if (SVEScatterOverhead.getNumOccurrences() > 0)
4813 return SVEScatterOverhead;
4814 return ST->getScatterOverhead();
4815 break;
4816 default:
4817 llvm_unreachable("Shouldn't have reached here");
4818 }
4819}
4820
4824
4825 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4826 MICA.getID() == Intrinsic::vp_gather)
4827 ? Instruction::Load
4828 : Instruction::Store;
4829
4830 Type *DataTy = MICA.getDataType();
4831 Align Alignment = MICA.getAlignment();
4832 const Instruction *I = MICA.getInst();
4833
4834 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4835 return InstructionCost::getInvalid();
4836 auto *VT = cast<VectorType>(DataTy);
4837 auto LT = getTypeLegalizationCost(DataTy);
4838 if (!LT.first.isValid())
4839 return InstructionCost::getInvalid();
4840
4841 // Return an invalid cost for element types that we are unable to lower.
4842 if (!LT.second.isVector() ||
4843 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4844 VT->getElementType()->isIntegerTy(1))
4845 return InstructionCost::getInvalid();
4846
4847 // The code-generator is currently not able to handle scalable vectors
4848 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4849 // it. This change will be removed when code-generation for these types is
4850 // sufficiently reliable.
4851 if (VT->getElementCount() == ElementCount::getScalable(1))
4852 return InstructionCost::getInvalid();
4853
4854 ElementCount LegalVF = LT.second.getVectorElementCount();
4855 InstructionCost MemOpCost =
4856 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4857 {TTI::OK_AnyValue, TTI::OP_None}, I);
4858 // Add on an overhead cost for using gathers/scatters.
4859 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4860 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4861}
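// Illustrative sketch (not from this file): the shape of the gather cost
// returned above. All concrete numbers are assumptions for the example: a
// per-element load cost of 1, the default sve-gather-overhead of 10, and a
// 128-bit SVE implementation so an <vscale x 4 x i32> gather touches at most
// 4 elements.
constexpr int LTFirst = 1;          // <vscale x 4 x i32> is already legal
constexpr int ScalarLoadCost = 1;
constexpr int GatherOverhead = 10;  // default of -sve-gather-overhead
constexpr int MaxElements = 4;      // vscale_range(1, 1): 128-bit vectors
constexpr int GatherCost = LTFirst * (ScalarLoadCost * GatherOverhead) * MaxElements;
static_assert(GatherCost == 40, "gathers cost far more than a contiguous load");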
4862
4864 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4865}
4866
4866
4867 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4868 Align Alignment,
4869 unsigned AddressSpace,
4870 TTI::TargetCostKind CostKind,
4871 TTI::OperandValueInfo OpInfo,
4872 const Instruction *I) const {
4873 EVT VT = TLI->getValueType(DL, Ty, true);
4874 // Type legalization can't handle structs
4875 if (VT == MVT::Other)
4876 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4877 CostKind);
4878
4879 auto LT = getTypeLegalizationCost(Ty);
4880 if (!LT.first.isValid())
4881 return InstructionCost::getInvalid();
4882
4883 // The code-generator is currently not able to handle scalable vectors
4884 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4885 // it. This change will be removed when code-generation for these types is
4886 // sufficiently reliable.
4887 // We also only support full register predicate loads and stores.
4888 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4889 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4890 (VTy->getElementType()->isIntegerTy(1) &&
4891 !VTy->getElementCount().isKnownMultipleOf(
4892 ElementCount::getScalable(16))))
4893 return InstructionCost::getInvalid();
4894
4895 // TODO: consider latency as well for TCK_SizeAndLatency.
4897 return LT.first;
4898
4900 return 1;
4901
4902 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4903 LT.second.is128BitVector() && Alignment < Align(16)) {
4904 // Unaligned stores are extremely inefficient. We don't split all
4905 // unaligned 128-bit stores because of the negative impact that has been
4906 // shown in practice on inlined block copy code.
4907 // We make such stores expensive so that we will only vectorize if there
4908 // are 6 other instructions getting vectorized.
4909 const int AmortizationCost = 6;
4910
4911 return LT.first * 2 * AmortizationCost;
4912 }
4913
4914 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4915 if (Ty->isPtrOrPtrVectorTy())
4916 return LT.first;
4917
4918 if (useNeonVector(Ty)) {
4919 // Check truncating stores and extending loads.
4920 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4921 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4922 if (VT == MVT::v4i8)
4923 return 2;
4924 // Otherwise we need to scalarize.
4925 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4926 }
4927 EVT EltVT = VT.getVectorElementType();
4928 unsigned EltSize = EltVT.getScalarSizeInBits();
4929 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4930 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4931 return LT.first;
4932 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4933 // widening to v4i8, which produces suboptimal results.
4934 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4935 return LT.first;
4936
4937 // Check non-power-of-2 loads/stores for legal vector element types with
4938 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4939 // operations on smaller power-of-2 ops, including ld1/st1.
4940 LLVMContext &C = Ty->getContext();
4942 SmallVector<EVT> TypeWorklist;
4943 TypeWorklist.push_back(VT);
4944 while (!TypeWorklist.empty()) {
4945 EVT CurrVT = TypeWorklist.pop_back_val();
4946 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4947 if (isPowerOf2_32(CurrNumElements)) {
4948 Cost += 1;
4949 continue;
4950 }
4951
4952 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4953 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4954 TypeWorklist.push_back(
4955 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4956 }
4957 return Cost;
4958 }
4959
4960 return LT.first;
4961}
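// Illustrative sketch (not from this file): the power-of-2 decomposition done
// by the worklist loop above, counting how many ld1/st1 operations a
// non-power-of-2 NEON vector breaks into. The split mirrors the
// NextPowerOf2(N) / 2 step used above.
constexpr bool isPow2(unsigned N) { return N != 0 && (N & (N - 1)) == 0; }
constexpr unsigned numMemOps(unsigned NumElts) {
  if (isPow2(NumElts))
    return 1;                 // one ld1/st1 for a power-of-2 chunk
  unsigned Prev = 1;
  while (Prev * 2 < NumElts)
    Prev *= 2;                // largest power of 2 below NumElts
  return numMemOps(Prev) + numMemOps(NumElts - Prev);
}
static_assert(numMemOps(3) == 2, "v3 -> v2 + v1");
static_assert(numMemOps(6) == 2, "v6 -> v4 + v2");
static_assert(numMemOps(7) == 3, "v7 -> v4 + v2 + v1");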
4962
4964 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4965 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4966 bool UseMaskForCond, bool UseMaskForGaps) const {
4967 assert(Factor >= 2 && "Invalid interleave factor");
4968 auto *VecVTy = cast<VectorType>(VecTy);
4969
4970 if (VecTy->isScalableTy() && !ST->hasSVE())
4971 return InstructionCost::getInvalid();
4972
4973 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4974 // only have lowering for power-of-2 factors.
4975 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4976 // InterleavedAccessPass for ld3/st3
4977 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4978 return InstructionCost::getInvalid();
4979
4980 // Vectorization for masked interleaved accesses is only enabled for scalable
4981 // VF.
4982 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4983 return InstructionCost::getInvalid();
4984
4985 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4986 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4987 auto *SubVecTy =
4988 VectorType::get(VecVTy->getElementType(),
4989 VecVTy->getElementCount().divideCoefficientBy(Factor));
4990
4991 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4992 // Accesses having vector types that are a multiple of 128 bits can be
4993 // matched to more than one ldN/stN instruction.
4994 bool UseScalable;
4995 if (MinElts % Factor == 0 &&
4996 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4997 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4998 }
4999
5000 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5001 Alignment, AddressSpace, CostKind,
5002 UseMaskForCond, UseMaskForGaps);
5003}
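// Illustrative sketch (not from this file): the ldN/stN cost shape used above
// for legal interleaved accesses. The assumption is that a legal sub-vector
// maps to roughly ceil(bits / 128) ldN/stN instructions, reflecting the
// 64/128-bit NEON register constraint mentioned in the comment.
constexpr int interleavedCost(int Factor, int SubVecBits) {
  int NumAccesses = (SubVecBits + 127) / 128;
  return Factor * (NumAccesses > 0 ? NumAccesses : 1);
}
static_assert(interleavedCost(/*Factor=*/3, /*v4i32=*/128) == 3, "one ld3");
static_assert(interleavedCost(/*Factor=*/2, /*v16i16=*/256) == 4, "two ld2 pairs");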
5004
5009 for (auto *I : Tys) {
5010 if (!I->isVectorTy())
5011 continue;
5012 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5013 128)
5014 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5015 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5016 }
5017 return Cost;
5018}
5019
5021 return ST->getMaxInterleaveFactor();
5022}
5023
5024// For Falkor, we want to avoid having too many strided loads in a loop since
5025// that can exhaust the HW prefetcher resources. We adjust the unroller
5026// MaxCount preference below to attempt to ensure unrolling doesn't create too
5027// many strided loads.
5028static void
5031 enum { MaxStridedLoads = 7 };
5032 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5033 int StridedLoads = 0;
5034 // FIXME? We could make this more precise by looking at the CFG and
5035 // e.g. not counting loads in each side of an if-then-else diamond.
5036 for (const auto BB : L->blocks()) {
5037 for (auto &I : *BB) {
5038 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5039 if (!LMemI)
5040 continue;
5041
5042 Value *PtrValue = LMemI->getPointerOperand();
5043 if (L->isLoopInvariant(PtrValue))
5044 continue;
5045
5046 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5047 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5048 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5049 continue;
5050
5051 // FIXME? We could take pairing of unrolled load copies into account
5052 // by looking at the AddRec, but we would probably have to limit this
5053 // to loops with no stores or other memory optimization barriers.
5054 ++StridedLoads;
5055 // We've seen enough strided loads that seeing more won't make a
5056 // difference.
5057 if (StridedLoads > MaxStridedLoads / 2)
5058 return StridedLoads;
5059 }
5060 }
5061 return StridedLoads;
5062 };
5063
5064 int StridedLoads = countStridedLoads(L, SE);
5065 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5066 << " strided loads\n");
5067 // Pick the largest power of 2 unroll count that won't result in too many
5068 // strided loads.
5069 if (StridedLoads) {
5070 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5071 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5072 << UP.MaxCount << '\n');
5073 }
5074}
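// Illustrative sketch (not from this file): the clamp computed above. With
// MaxStridedLoads = 7, a loop with S strided loads gets
// MaxCount = 1 << Log2_32(7 / S), keeping the unrolled strided loads within
// the prefetcher budget.
constexpr unsigned floorLog2(unsigned V) {
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}
constexpr unsigned falkorMaxCount(unsigned StridedLoads) {
  return 1u << floorLog2(7 / StridedLoads);
}
static_assert(falkorMaxCount(1) == 4, "7 / 1 = 7 -> 2^2");
static_assert(falkorMaxCount(2) == 2, "7 / 2 = 3 -> 2^1");
static_assert(falkorMaxCount(4) == 1, "7 / 4 = 1 -> 2^0");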
5075
5076// This function returns true if the loop:
5077// 1. Has a valid cost, and
5078// 2. Has a cost within the supplied budget.
5079// Otherwise it returns false.
5081 InstructionCost Budget,
5082 unsigned *FinalSize) {
5083 // Estimate the size of the loop.
5084 InstructionCost LoopCost = 0;
5085
5086 for (auto *BB : L->getBlocks()) {
5087 for (auto &I : *BB) {
5088 SmallVector<const Value *, 4> Operands(I.operand_values());
5089 InstructionCost Cost =
5090 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5091 // This can happen with intrinsics that don't currently have a cost model
5092 // or for some operations that require SVE.
5093 if (!Cost.isValid())
5094 return false;
5095
5096 LoopCost += Cost;
5097 if (LoopCost > Budget)
5098 return false;
5099 }
5100 }
5101
5102 if (FinalSize)
5103 *FinalSize = LoopCost.getValue();
5104 return true;
5105}
5106
5108 const AArch64TTIImpl &TTI) {
5109 // Only consider loops with unknown trip counts for which we can determine
5110 // a symbolic expression. Multi-exit loops with small known trip counts will
5111 // likely be unrolled anyway.
5112 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5113 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5114 return false;
5115
5116 // It might not be worth unrolling loops with low max trip counts. Restrict
5117 // this to max trip counts > 32 for now.
5118 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5119 if (MaxTC > 0 && MaxTC <= 32)
5120 return false;
5121
5122 // Make sure the loop size is <= 5.
5123 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5124 return false;
5125
5126 // Small search loops with multiple exits can be highly beneficial to unroll.
5127 // We only care about loops with exactly two exiting blocks, although each
5128 // block could jump to the same exit block.
5129 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5130 if (Blocks.size() != 2)
5131 return false;
5132
5133 if (any_of(Blocks, [](BasicBlock *BB) {
5134 return !isa<BranchInst>(BB->getTerminator());
5135 }))
5136 return false;
5137
5138 return true;
5139}
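// Illustrative sketch (not from this file): the kind of small two-exit search
// loop this heuristic targets. The loop below has an unknown trip count, a
// tiny body, and two exiting blocks (the bound check and the early return),
// i.e. the std::find-like shape that benefits from runtime unrolling.
int findFirst(const int *Data, int N, int Key) {
  for (int I = 0; I < N; ++I) // exiting block 1: the loop bound
    if (Data[I] == Key)
      return I;               // exiting block 2: early exit on a match
  return -1;
}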
5140
5141 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5142 /// OOO engine's wide instruction window and various predictors.
5143static void
5146 const AArch64TTIImpl &TTI) {
5147 // Limit this to loops with structure that is highly likely to benefit from
5148 // runtime unrolling; that is, we exclude outer loops and loops with many
5149 // blocks (i.e. likely with complex control flow). Note that the heuristics
5150 // here may be overly conservative and we err on the side of avoiding runtime
5151 // unrolling rather than unrolling excessively. They are all subject to further refinement.
5152 if (!L->isInnermost() || L->getNumBlocks() > 8)
5153 return;
5154
5155 // Loops with multiple exits are handled by common code.
5156 if (!L->getExitBlock())
5157 return;
5158
5159 // Check if the loop contains any reductions that could be parallelized when
5160 // unrolling. If so, enable partial unrolling, if the trip count is known to
5161 // be a multiple of 2.
5162 bool HasParellelizableReductions =
5163 L->getNumBlocks() == 1 &&
5164 any_of(L->getHeader()->phis(),
5165 [&SE, L](PHINode &Phi) {
5166 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5167 }) &&
5168 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5169 if (HasParellelizableReductions &&
5170 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5171 UP.Partial = true;
5172 UP.MaxCount = 4;
5173 UP.AddAdditionalAccumulators = true;
5174 }
5175
5176 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5178 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5179 SE.getSmallConstantMaxTripCount(L) <= 32))
5180 return;
5181
5182 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5183 return;
5184
5186 return;
5187
5188 // Limit to loops with trip counts that are cheap to expand.
5189 UP.SCEVExpansionBudget = 1;
5190
5191 if (HasParellelizableReductions) {
5192 UP.Runtime = true;
5194 UP.AddAdditionalAccumulators = true;
5195 }
5196
5197 // Try to unroll small loops of few blocks with a low budget, if they have
5198 // load/store dependencies, to expose more parallel memory access streams,
5199 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5200 BasicBlock *Header = L->getHeader();
5201 BasicBlock *Latch = L->getLoopLatch();
5202 if (Header == Latch) {
5203 // Estimate the size of the loop.
5204 unsigned Size;
5205 unsigned Width = 10;
5206 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5207 return;
5208
5209 // Try to find an unroll count that maximizes use of the instruction window,
5210 // i.e. try to fetch as many instructions per cycle as possible; see the sketch after this function.
5211 unsigned MaxInstsPerLine = 16;
5212 unsigned UC = 1;
5213 unsigned BestUC = 1;
5214 unsigned SizeWithBestUC = BestUC * Size;
5215 while (UC <= 8) {
5216 unsigned SizeWithUC = UC * Size;
5217 if (SizeWithUC > 48)
5218 break;
5219 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5220 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5221 BestUC = UC;
5222 SizeWithBestUC = BestUC * Size;
5223 }
5224 UC++;
5225 }
5226
5227 if (BestUC == 1)
5228 return;
5229
5230 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5232 for (auto *BB : L->blocks()) {
5233 for (auto &I : *BB) {
5235 if (!Ptr)
5236 continue;
5237 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5238 if (SE.isLoopInvariant(PtrSCEV, L))
5239 continue;
5240 if (isa<LoadInst>(&I)) {
5241 LoadedValuesPlus.insert(&I);
5242 // Include in-loop 1st users of loaded values.
5243 for (auto *U : I.users())
5244 if (L->contains(cast<Instruction>(U)))
5245 LoadedValuesPlus.insert(U);
5246 } else
5247 Stores.push_back(cast<StoreInst>(&I));
5248 }
5249 }
5250
5251 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5252 return LoadedValuesPlus.contains(SI->getOperand(0));
5253 }))
5254 return;
5255
5256 UP.Runtime = true;
5257 UP.DefaultUnrollRuntimeCount = BestUC;
5258 return;
5259 }
5260
5261 // Try to runtime-unroll loops with early-continues depending on loop-varying
5262 // loads; this helps with branch-prediction for the early-continues.
5263 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5265 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5266 !llvm::is_contained(Preds, Header) ||
5267 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5268 return;
5269
5270 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5271 [&](Instruction *I, unsigned Depth) -> bool {
5272 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5273 return false;
5274
5275 if (isa<LoadInst>(I))
5276 return true;
5277
5278 return any_of(I->operands(), [&](Value *V) {
5279 auto *I = dyn_cast<Instruction>(V);
5280 return I && DependsOnLoopLoad(I, Depth + 1);
5281 });
5282 };
5283 CmpPredicate Pred;
5284 Instruction *I;
5285 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5286 m_Value())) &&
5287 DependsOnLoopLoad(I, 0)) {
5288 UP.Runtime = true;
5289 }
5290}
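// Illustrative sketch (not from this file): the unroll-count search in the
// single-block case above, which picks the count whose unrolled size best
// fills the 16-instruction fetch window without exceeding 48 instructions.
constexpr unsigned bestUnrollCount(unsigned Size) {
  constexpr unsigned MaxInstsPerLine = 16;
  unsigned UC = 1, BestUC = 1, SizeWithBestUC = BestUC * Size;
  while (UC <= 8) {
    unsigned SizeWithUC = UC * Size;
    if (SizeWithUC > 48)
      break;
    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
      BestUC = UC;
      SizeWithBestUC = BestUC * Size;
    }
    UC++;
  }
  return BestUC;
}
static_assert(bestUnrollCount(10) == 3, "30 instructions, best remainder");
static_assert(bestUnrollCount(12) == 4, "48 instructions, three full lines");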
5291
5294 OptimizationRemarkEmitter *ORE) const {
5295 // Enable partial unrolling and runtime unrolling.
5296 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5297
5298 UP.UpperBound = true;
5299
5300 // An inner loop is more likely to be hot, and its runtime check can be
5301 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5302 // threshold to unroll more loops.
5303 if (L->getLoopDepth() > 1)
5304 UP.PartialThreshold *= 2;
5305
5306 // Disable partial & runtime unrolling on -Os.
5308
5309 // Scan the loop: don't unroll loops with calls as this could prevent
5310 // inlining. Don't unroll auto-vectorized loops either, though do allow
5311 // unrolling of the scalar remainder.
5312 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5314 for (auto *BB : L->getBlocks()) {
5315 for (auto &I : *BB) {
5316 // Both auto-vectorized loops and the scalar remainder have the
5317 // isvectorized attribute, so differentiate between them by the presence
5318 // of vector instructions.
5319 if (IsVectorized && I.getType()->isVectorTy())
5320 return;
5321 if (isa<CallBase>(I)) {
5324 if (!isLoweredToCall(F))
5325 continue;
5326 return;
5327 }
5328
5329 SmallVector<const Value *, 4> Operands(I.operand_values());
5330 Cost += getInstructionCost(&I, Operands,
5332 }
5333 }
5334
5335 // Apply subtarget-specific unrolling preferences.
5336 if (ST->isAppleMLike())
5337 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5338 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5339 EnableFalkorHWPFUnrollFix)
5340 getFalkorUnrollingPreferences(L, SE, UP);
5341
5342 // If this is a small, multi-exit loop similar to something like std::find,
5343 // then there is typically a performance improvement achieved by unrolling.
5344 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5345 UP.RuntimeUnrollMultiExit = true;
5346 UP.Runtime = true;
5347 // Limit unroll count.
5349 // Allow slightly more costly trip-count expansion to catch search loops
5350 // with pointer inductions.
5351 UP.SCEVExpansionBudget = 5;
5352 return;
5353 }
5354
5355 // Enable runtime unrolling for in-order models.
5356 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5357 // by checking for that case, we can ensure that the default behaviour is
5358 // unchanged.
5359 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5360 !ST->getSchedModel().isOutOfOrder()) {
5361 UP.Runtime = true;
5362 UP.Partial = true;
5363 UP.UnrollRemainder = true;
5365
5366 UP.UnrollAndJam = true;
5368 }
5369
5370 // Forcing unrolling of small loops can be very useful because of the
5371 // branch-taken cost of the backedge.
5373 UP.Force = true;
5374}
5375
5380
5381 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5382 Type *ExpectedType,
5383 bool CanCreate) const {
5384 switch (Inst->getIntrinsicID()) {
5385 default:
5386 return nullptr;
5387 case Intrinsic::aarch64_neon_st2:
5388 case Intrinsic::aarch64_neon_st3:
5389 case Intrinsic::aarch64_neon_st4: {
5390 // Create a struct type
5391 StructType *ST = dyn_cast<StructType>(ExpectedType);
5392 if (!CanCreate || !ST)
5393 return nullptr;
5394 unsigned NumElts = Inst->arg_size() - 1;
5395 if (ST->getNumElements() != NumElts)
5396 return nullptr;
5397 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5398 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5399 return nullptr;
5400 }
5401 Value *Res = PoisonValue::get(ExpectedType);
5402 IRBuilder<> Builder(Inst);
5403 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5404 Value *L = Inst->getArgOperand(i);
5405 Res = Builder.CreateInsertValue(Res, L, i);
5406 }
5407 return Res;
5408 }
5409 case Intrinsic::aarch64_neon_ld2:
5410 case Intrinsic::aarch64_neon_ld3:
5411 case Intrinsic::aarch64_neon_ld4:
5412 if (Inst->getType() == ExpectedType)
5413 return Inst;
5414 return nullptr;
5415 }
5416}
5417
5418 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5419 MemIntrinsicInfo &Info) const {
5420 switch (Inst->getIntrinsicID()) {
5421 default:
5422 break;
5423 case Intrinsic::aarch64_neon_ld2:
5424 case Intrinsic::aarch64_neon_ld3:
5425 case Intrinsic::aarch64_neon_ld4:
5426 Info.ReadMem = true;
5427 Info.WriteMem = false;
5428 Info.PtrVal = Inst->getArgOperand(0);
5429 break;
5430 case Intrinsic::aarch64_neon_st2:
5431 case Intrinsic::aarch64_neon_st3:
5432 case Intrinsic::aarch64_neon_st4:
5433 Info.ReadMem = false;
5434 Info.WriteMem = true;
5435 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5436 break;
5437 }
5438
5439 switch (Inst->getIntrinsicID()) {
5440 default:
5441 return false;
5442 case Intrinsic::aarch64_neon_ld2:
5443 case Intrinsic::aarch64_neon_st2:
5444 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5445 break;
5446 case Intrinsic::aarch64_neon_ld3:
5447 case Intrinsic::aarch64_neon_st3:
5448 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5449 break;
5450 case Intrinsic::aarch64_neon_ld4:
5451 case Intrinsic::aarch64_neon_st4:
5452 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5453 break;
5454 }
5455 return true;
5456}
5457
5458 /// See if \p I should be considered for address type promotion. We check if
5459 /// \p I is a sext with the right type that is used in memory accesses. If it
5460 /// is used in a "complex" getelementptr, we allow it to be promoted without
5461 /// finding other sext instructions that sign extended the same initial value.
5462 /// A getelementptr is considered "complex" if it has more than 2 operands.
5463 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5464 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5465 bool Considerable = false;
5466 AllowPromotionWithoutCommonHeader = false;
5467 if (!isa<SExtInst>(&I))
5468 return false;
5469 Type *ConsideredSExtType =
5470 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5471 if (I.getType() != ConsideredSExtType)
5472 return false;
5473 // See if the sext is the one with the right type and used in at least one
5474 // GetElementPtrInst.
5475 for (const User *U : I.users()) {
5476 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5477 Considerable = true;
5478 // A getelementptr is considered as "complex" if it has more than 2
5479 // operands. We will promote a SExt used in such complex GEP as we
5480 // expect some computation to be merged if they are done on 64 bits.
5481 if (GEPInst->getNumOperands() > 2) {
5482 AllowPromotionWithoutCommonHeader = true;
5483 break;
5484 }
5485 }
5486 }
5487 return Considerable;
5488}
5489
5490 bool AArch64TTIImpl::isLegalToVectorizeReduction(
5491 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5492 if (!VF.isScalable())
5493 return true;
5494
5495 Type *Ty = RdxDesc.getRecurrenceType();
5496 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5497 return false;
5498
5499 switch (RdxDesc.getRecurrenceKind()) {
5500 case RecurKind::Sub:
5502 case RecurKind::Add:
5503 case RecurKind::FAdd:
5504 case RecurKind::And:
5505 case RecurKind::Or:
5506 case RecurKind::Xor:
5507 case RecurKind::SMin:
5508 case RecurKind::SMax:
5509 case RecurKind::UMin:
5510 case RecurKind::UMax:
5511 case RecurKind::FMin:
5512 case RecurKind::FMax:
5513 case RecurKind::FMulAdd:
5514 case RecurKind::AnyOf:
5515 return true;
5516 default:
5517 return false;
5518 }
5519}
5520
5523 FastMathFlags FMF,
5525 // The code-generator is currently not able to handle scalable vectors
5526 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5527 // it. This change will be removed when code-generation for these types is
5528 // sufficiently reliable.
5529 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5530 if (VTy->getElementCount() == ElementCount::getScalable(1))
5531 return InstructionCost::getInvalid();
5532
5533 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5534
5535 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5536 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5537
5538 InstructionCost LegalizationCost = 0;
5539 if (LT.first > 1) {
5540 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5541 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5542 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5543 }
5544
5545 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5546}
5547
5549 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5550 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5551 InstructionCost LegalizationCost = 0;
5552 if (LT.first > 1) {
5553 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5554 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5555 LegalizationCost *= LT.first - 1;
5556 }
5557
5558 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5559 assert(ISD && "Invalid opcode");
5560 // Add the final reduction cost for the legal horizontal reduction
5561 switch (ISD) {
5562 case ISD::ADD:
5563 case ISD::AND:
5564 case ISD::OR:
5565 case ISD::XOR:
5566 case ISD::FADD:
5567 return LegalizationCost + 2;
5568 default:
5569 return InstructionCost::getInvalid();
5570 }
5571}
5572
5575 std::optional<FastMathFlags> FMF,
5577 // The code-generator is currently not able to handle scalable vectors
5578 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5579 // it. This change will be removed when code-generation for these types is
5580 // sufficiently reliable.
5581 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5582 if (VTy->getElementCount() == ElementCount::getScalable(1))
5583 return InstructionCost::getInvalid();
5584
5586 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5587 InstructionCost BaseCost =
5588 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5589 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5590 // end up vectorizing for more computationally intensive loops.
5591 return BaseCost + FixedVTy->getNumElements();
5592 }
5593
5594 if (Opcode != Instruction::FAdd)
5595 return InstructionCost::getInvalid();
5596
5597 auto *VTy = cast<ScalableVectorType>(ValTy);
5599 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5600 Cost *= getMaxNumElements(VTy->getElementCount());
5601 return Cost;
5602 }
5603
5604 if (isa<ScalableVectorType>(ValTy))
5605 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5606
5607 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5608 MVT MTy = LT.second;
5609 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5610 assert(ISD && "Invalid opcode");
5611
5612 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5613 // instructions as twice a normal vector add, plus 1 for each legalization
5614 // step (LT.first). This is the only arithmetic vector reduction operation for
5615 // which we have an instruction.
5616 // OR, XOR and AND costs should match the codegen from:
5617 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5618 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5619 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5620 static const CostTblEntry CostTblNoPairwise[]{
5621 {ISD::ADD, MVT::v8i8, 2},
5622 {ISD::ADD, MVT::v16i8, 2},
5623 {ISD::ADD, MVT::v4i16, 2},
5624 {ISD::ADD, MVT::v8i16, 2},
5625 {ISD::ADD, MVT::v2i32, 2},
5626 {ISD::ADD, MVT::v4i32, 2},
5627 {ISD::ADD, MVT::v2i64, 2},
5628 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5629 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5630 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5631 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5632 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5633 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5634 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5635 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5636 {ISD::XOR, MVT::v16i8, 7},
5637 {ISD::XOR, MVT::v4i16, 4},
5638 {ISD::XOR, MVT::v8i16, 6},
5639 {ISD::XOR, MVT::v2i32, 3},
5640 {ISD::XOR, MVT::v4i32, 5},
5641 {ISD::XOR, MVT::v2i64, 3},
5642 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5643 {ISD::AND, MVT::v16i8, 7},
5644 {ISD::AND, MVT::v4i16, 4},
5645 {ISD::AND, MVT::v8i16, 6},
5646 {ISD::AND, MVT::v2i32, 3},
5647 {ISD::AND, MVT::v4i32, 5},
5648 {ISD::AND, MVT::v2i64, 3},
5649 };
5650 switch (ISD) {
5651 default:
5652 break;
5653 case ISD::FADD:
5654 if (Type *EltTy = ValTy->getScalarType();
5655 // FIXME: For half types without fullfp16 support, this could extend and
5656 // use a fp32 faddp reduction but current codegen unrolls.
5657 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5658 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5659 const unsigned NElts = MTy.getVectorNumElements();
5660 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5661 isPowerOf2_32(NElts))
5662 // Reduction corresponding to series of fadd instructions is lowered to
5663 // series of faddp instructions. faddp has latency/throughput that
5664 // matches fadd instruction and hence, every faddp instruction can be
5665 // considered to have a relative cost = 1 with
5666 // CostKind = TCK_RecipThroughput.
5667 // An faddp will pairwise add vector elements, so the size of input
5668 // vector reduces by half every time, requiring
5669 // #(faddp instructions) = log2_32(NElts).
5670 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5671 }
5672 break;
5673 case ISD::ADD:
5674 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5675 return (LT.first - 1) + Entry->Cost;
5676 break;
5677 case ISD::XOR:
5678 case ISD::AND:
5679 case ISD::OR:
5680 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5681 if (!Entry)
5682 break;
5683 auto *ValVTy = cast<FixedVectorType>(ValTy);
5684 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5685 isPowerOf2_32(ValVTy->getNumElements())) {
5686 InstructionCost ExtraCost = 0;
5687 if (LT.first != 1) {
5688 // Type needs to be split, so there is an extra cost of LT.first - 1
5689 // arithmetic ops.
5690 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5691 MTy.getVectorNumElements());
5692 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5693 ExtraCost *= LT.first - 1;
5694 }
5695 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5696 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5697 return Cost + ExtraCost;
5698 }
5699 break;
5700 }
5701 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5702}
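// Illustrative sketch (not from this file): the two reduction formulas used
// above. An integer add reduction maps to addv at a table cost of 2, plus one
// extra vector add per legalization split; an fadd reduction lowers to a
// chain of log2(NElts) faddp instructions over the legalized type.
constexpr unsigned floorLog2(unsigned V) {
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;
}
constexpr unsigned addvCost(unsigned LTFirst) { return (LTFirst - 1) + 2; }
constexpr unsigned faddReductionCost(unsigned LTFirst, unsigned LegalNElts) {
  return (LTFirst - 1) + floorLog2(LegalNElts);
}
static_assert(addvCost(/*v4i32*/ 1) == 2, "a single addv");
static_assert(addvCost(/*v8i32*/ 2) == 3, "add the two halves, then addv");
static_assert(faddReductionCost(/*v4f32*/ 1, 4) == 2, "faddp + faddp");
static_assert(faddReductionCost(/*v8f32*/ 2, 4) == 3, "fadd the halves + 2 faddp");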
5703
5705 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5706 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5707 EVT VecVT = TLI->getValueType(DL, VecTy);
5708 EVT ResVT = TLI->getValueType(DL, ResTy);
5709
5710 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5711 VecVT.getSizeInBits() >= 64) {
5712 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5713
5714 // The legal cases are:
5715 // UADDLV 8/16/32->32
5716 // UADDLP 32->64
5717 unsigned RevVTSize = ResVT.getSizeInBits();
5718 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5719 RevVTSize <= 32) ||
5720 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5721 RevVTSize <= 32) ||
5722 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5723 RevVTSize <= 64))
5724 return (LT.first - 1) * 2 + 2;
5725 }
5726
5727 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5728 CostKind);
5729}
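// Illustrative sketch (not from this file): the extended-add reduction cost
// above for the legal uaddlv/uaddlp-style cases, (LT.first - 1) * 2 + 2.
constexpr unsigned extAddReductionCost(unsigned LTFirst) {
  return (LTFirst - 1) * 2 + 2;
}
static_assert(extAddReductionCost(/*v16i8 -> i32*/ 1) == 2, "one uaddlv");
static_assert(extAddReductionCost(/*v32i8 -> i32*/ 2) == 4, "two legal halves");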
5730
5732AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5733 Type *ResTy, VectorType *VecTy,
5735 EVT VecVT = TLI->getValueType(DL, VecTy);
5736 EVT ResVT = TLI->getValueType(DL, ResTy);
5737
5738 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5739 RedOpcode == Instruction::Add) {
5740 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5741
5742 // The legal cases with dotprod are
5743 // UDOT 8->32
5744 // Which requires an additional uaddv to sum the i32 values.
5745 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5746 ResVT == MVT::i32)
5747 return LT.first + 2;
5748 }
5749
5750 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5751 CostKind);
5752}
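// Illustrative sketch (not from this file): the dot-product reduction cost
// above when +dotprod is available, LT.first + 2, i.e. one udot/sdot per
// legal vector plus a fixed 2 for summing the i32 accumulator with uaddv.
constexpr unsigned dotReductionCost(unsigned LTFirst) { return LTFirst + 2; }
static_assert(dotReductionCost(/*v16i8*/ 1) == 3, "udot + final uaddv");
static_assert(dotReductionCost(/*v32i8*/ 2) == 4, "two udots + final uaddv");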
5753
5757 static const CostTblEntry ShuffleTbl[] = {
5758 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5759 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5760 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5761 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5762 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5763 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5764 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5765 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5766 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5767 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5768 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5769 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5770 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5771 };
5772
5773 // The code-generator is currently not able to handle scalable vectors
5774 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5775 // it. This change will be removed when code-generation for these types is
5776 // sufficiently reliable.
5779
5780 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5781 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5782 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5783 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5784 : LT.second;
5785 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5786 InstructionCost LegalizationCost = 0;
5787 if (Index < 0) {
5788 LegalizationCost =
5789 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5791 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5793 }
5794
5795 // Predicated splices are promoted during lowering (see AArch64ISelLowering.cpp),
5796 // so the cost is computed on the promoted type.
5797 if (LT.second.getScalarType() == MVT::i1) {
5798 LegalizationCost +=
5799 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5801 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5803 }
5804 const auto *Entry =
5805 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5806 assert(Entry && "Illegal Type for Splice");
5807 LegalizationCost += Entry->Cost;
5808 return LegalizationCost * LT.first;
5809}
5810
5812 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5814 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5817
5819 return Invalid;
5820
5821 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5822 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5823 return Invalid;
5824
5825 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5826 OpAExtend == TTI::PR_None)
5827 return Invalid;
5828
5829 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5830 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5831 "Unexpected values for OpBExtend or InputTypeB");
5832
5833 // We only support multiply binary operations for now, and for muls we
5834 // require the extended types to be the same.
5835 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5836 return Invalid;
5837
5838 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5839 if (IsUSDot && !ST->hasMatMulInt8())
5840 return Invalid;
5841
5842 unsigned Ratio =
5843 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5844 if (VF.getKnownMinValue() <= Ratio)
5845 return Invalid;
5846
5847 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5848 VectorType *AccumVectorType =
5849 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5850 // We don't yet support all kinds of legalization.
5851 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5852 EVT::getEVT(AccumVectorType));
5853 switch (TC.first) {
5854 default:
5855 return Invalid;
5859 // The legalised type (e.g. after splitting) must be legal too.
5860 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5862 return Invalid;
5863 break;
5864 }
5865
5866 std::pair<InstructionCost, MVT> AccumLT =
5867 getTypeLegalizationCost(AccumVectorType);
5868 std::pair<InstructionCost, MVT> InputLT =
5869 getTypeLegalizationCost(InputVectorType);
5870
5871 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5872
5873 // Prefer using full types by costing half-full input types as more expensive.
5874 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5876 // FIXME: This can be removed after the cost of the extends are folded into
5877 // the dot-product expression in VPlan, after landing:
5878 // https://github.com/llvm/llvm-project/pull/147302
5879 Cost *= 2;
5880
5881 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5882 // i16 -> i64 is natively supported for udot/sdot
5883 if (AccumLT.second.getScalarType() == MVT::i64 &&
5884 InputLT.second.getScalarType() == MVT::i16)
5885 return Cost;
5886 // i8 -> i64 is supported with an extra level of extends
5887 if (AccumLT.second.getScalarType() == MVT::i64 &&
5888 InputLT.second.getScalarType() == MVT::i8)
5889 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5890 // because it requires two extra extends on the inputs. But if we'd change
5891 // that now, a regular reduction would be cheaper because the costs of
5892 // the extends in the IR are still counted. This can be fixed
5893 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5894 return Cost;
5895 }
5896
5897 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5898 if (ST->isSVEorStreamingSVEAvailable() ||
5899 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5900 ST->hasDotProd())) {
5901 if (AccumLT.second.getScalarType() == MVT::i32 &&
5902 InputLT.second.getScalarType() == MVT::i8)
5903 return Cost;
5904 }
5905
5906 // Add additional cost for the extends that would need to be inserted.
5907 return Cost + 2;
5908}
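// Illustrative sketch (not from this file): the ratio bookkeeping above for a
// partial reduction. For i8 inputs accumulated into i32 the ratio is 4, so a
// VF of 16 feeds a 4-lane i32 accumulator, while a VF of 4 or less is
// rejected as Invalid.
constexpr unsigned AccBits = 32, InputBits = 8;
constexpr unsigned Ratio = AccBits / InputBits;
static_assert(Ratio == 4, "four input lanes per accumulator lane");
constexpr unsigned VF = 16;
static_assert(VF > Ratio, "VFs of Ratio or below are rejected");
constexpr unsigned AccumVF = VF / Ratio;
static_assert(AccumVF == 4, "16 x i8 partial-reduces into 4 x i32");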
5909
5912 VectorType *SrcTy, ArrayRef<int> Mask,
5913 TTI::TargetCostKind CostKind, int Index,
5915 const Instruction *CxtI) const {
5916 assert((Mask.empty() || DstTy->isScalableTy() ||
5917 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5918 "Expected the Mask to match the return size if given");
5919 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5920 "Expected the same scalar types");
5921 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5922
5923 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5924 // into smaller vectors and sum the cost of each shuffle.
5925 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5926 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5927 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5928 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5929 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5930 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5931 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5932 // cost than just the load.
5933 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5936 return std::max<InstructionCost>(1, LT.first / 4);
5937
5938 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5939 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5940 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5941 // cost than just the store.
5942 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5944 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5946 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5947 return LT.first;
5948
5949 unsigned TpNumElts = Mask.size();
5950 unsigned LTNumElts = LT.second.getVectorNumElements();
5951 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5952 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5953 LT.second.getVectorElementCount());
5955 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5956 PreviousCosts;
5957 for (unsigned N = 0; N < NumVecs; N++) {
5958 SmallVector<int> NMask;
5959 // Split the existing mask into chunks of size LTNumElts. Track the source
5960 // sub-vectors to ensure the result has at most 2 inputs.
5961 unsigned Source1 = -1U, Source2 = -1U;
5962 unsigned NumSources = 0;
5963 for (unsigned E = 0; E < LTNumElts; E++) {
5964 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5966 if (MaskElt < 0) {
5968 continue;
5969 }
5970
5971 // Calculate which source from the input this comes from and whether it
5972 // is new to us.
5973 unsigned Source = MaskElt / LTNumElts;
5974 if (NumSources == 0) {
5975 Source1 = Source;
5976 NumSources = 1;
5977 } else if (NumSources == 1 && Source != Source1) {
5978 Source2 = Source;
5979 NumSources = 2;
5980 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5981 NumSources++;
5982 }
5983
5984 // Add to the new mask. For the NumSources>2 case these are not correct,
5985 // but are only used for the modular lane number.
5986 if (Source == Source1)
5987 NMask.push_back(MaskElt % LTNumElts);
5988 else if (Source == Source2)
5989 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5990 else
5991 NMask.push_back(MaskElt % LTNumElts);
5992 }
5993 // Check if we have already generated this sub-shuffle, which means we
5994 // will have already generated the output. For example a <16 x i32> splat
5995 // will be the same sub-splat 4 times, which only needs to be generated
5996 // once and reused.
5997 auto Result =
5998 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5999 // Check if it was already in the map (already costed).
6000 if (!Result.second)
6001 continue;
6002 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6003 // getShuffleCost. If not then cost it using the worst case as the number
6004 // of element moves into a new vector.
6005 InstructionCost NCost =
6006 NumSources <= 2
6007 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6008 : TTI::SK_PermuteTwoSrc,
6009 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6010 CxtI)
6011 : LTNumElts;
6012 Result.first->second = NCost;
6013 Cost += NCost;
6014 }
6015 return Cost;
6016 }
6017
6018 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6019 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6020 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6021 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6022 // This currently only handles low or high extracts to prevent SLP vectorizer
6023 // regressions.
6024 // Note that SVE's ext instruction is destructive, but it can be fused with
6025 // a movprfx to act like a constructive instruction.
6026 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6027 if (LT.second.getFixedSizeInBits() >= 128 &&
6028 cast<FixedVectorType>(SubTp)->getNumElements() ==
6029 LT.second.getVectorNumElements() / 2) {
6030 if (Index == 0)
6031 return 0;
6032 if (Index == (int)LT.second.getVectorNumElements() / 2)
6033 return 1;
6034 }
6036 }
6037 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6038 // the code to handle length-changing shuffles.
6039 if (Kind == TTI::SK_InsertSubvector) {
6040 LT = getTypeLegalizationCost(DstTy);
6041 SrcTy = DstTy;
6042 }
6043
6044 // Check for identity masks, which we can treat as free for both fixed and
6045 // scalable vector paths.
6046 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6047 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6048 all_of(enumerate(Mask), [](const auto &M) {
6049 return M.value() < 0 || M.value() == (int)M.index();
6050 }))
6051 return 0;
6052
6053 // Segmented shuffle matching.
6054 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6055 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6056 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6058
6060 unsigned Segments =
6062 unsigned SegmentElts = VTy->getNumElements() / Segments;
6063
6064 // dupq zd.t, zn.t[idx]
6065 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6066 ST->isSVEorStreamingSVEAvailable() &&
6067 isDUPQMask(Mask, Segments, SegmentElts))
6068 return LT.first;
6069
6070 // mov zd.q, vn
6071 if (ST->isSVEorStreamingSVEAvailable() &&
6072 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6073 return LT.first;
6074 }
6075
6076 // Check for broadcast loads, which are supported by the LD1R instruction.
6077 // In terms of code-size, the shuffle vector is free when a load + dup get
6078 // folded into a LD1R. That's what we check and return here. For performance
6079 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6080 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6081 // that we model the load + dup sequence slightly higher because LD1R is a
6082 // high latency instruction.
6083 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6084 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6085 if (IsLoad && LT.second.isVector() &&
6086 isLegalBroadcastLoad(SrcTy->getElementType(),
6087 LT.second.getVectorElementCount()))
6088 return 0;
6089 }
6090
6091 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6092 // from the perfect shuffle tables.
6093 if (Mask.size() == 4 &&
6094 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6095 (SrcTy->getScalarSizeInBits() == 16 ||
6096 SrcTy->getScalarSizeInBits() == 32) &&
6097 all_of(Mask, [](int E) { return E < 8; }))
6098 return getPerfectShuffleCost(Mask);
6099
6100 // Check for other shuffles that are not SK_ kinds but we have native
6101 // instructions for, for example ZIP and UZP.
6102 unsigned Unused;
6103 if (LT.second.isFixedLengthVector() &&
6104 LT.second.getVectorNumElements() == Mask.size() &&
6105 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6106 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6107 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6108 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6109 LT.second.getVectorNumElements(), 16) ||
6110 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6111 LT.second.getVectorNumElements(), 32) ||
6112 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6113 LT.second.getVectorNumElements(), 64) ||
6114 // Check for non-zero lane splats
6115 all_of(drop_begin(Mask),
6116 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6117 return 1;
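// For example, on an 8 x i16 shuffle the masks <0, 8, 1, 9, 2, 10, 3, 11>
// (zip1), <0, 2, 4, 6, 8, 10, 12, 14> (uzp1), <1, 0, 3, 2, 5, 4, 7, 6> (rev32)
// and lane splats such as <5, 5, 5, 5, 5, 5, 5, 5> all map to a single
// instruction and are costed as 1 here.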
6118
6119 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6120 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6121 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6122 static const CostTblEntry ShuffleTbl[] = {
6123 // Broadcast shuffle kinds can be performed with 'dup'.
6124 {TTI::SK_Broadcast, MVT::v8i8, 1},
6125 {TTI::SK_Broadcast, MVT::v16i8, 1},
6126 {TTI::SK_Broadcast, MVT::v4i16, 1},
6127 {TTI::SK_Broadcast, MVT::v8i16, 1},
6128 {TTI::SK_Broadcast, MVT::v2i32, 1},
6129 {TTI::SK_Broadcast, MVT::v4i32, 1},
6130 {TTI::SK_Broadcast, MVT::v2i64, 1},
6131 {TTI::SK_Broadcast, MVT::v4f16, 1},
6132 {TTI::SK_Broadcast, MVT::v8f16, 1},
6133 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6134 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6135 {TTI::SK_Broadcast, MVT::v2f32, 1},
6136 {TTI::SK_Broadcast, MVT::v4f32, 1},
6137 {TTI::SK_Broadcast, MVT::v2f64, 1},
6138 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6139 // 'zip1/zip2' instructions.
6140 {TTI::SK_Transpose, MVT::v8i8, 1},
6141 {TTI::SK_Transpose, MVT::v16i8, 1},
6142 {TTI::SK_Transpose, MVT::v4i16, 1},
6143 {TTI::SK_Transpose, MVT::v8i16, 1},
6144 {TTI::SK_Transpose, MVT::v2i32, 1},
6145 {TTI::SK_Transpose, MVT::v4i32, 1},
6146 {TTI::SK_Transpose, MVT::v2i64, 1},
6147 {TTI::SK_Transpose, MVT::v4f16, 1},
6148 {TTI::SK_Transpose, MVT::v8f16, 1},
6149 {TTI::SK_Transpose, MVT::v4bf16, 1},
6150 {TTI::SK_Transpose, MVT::v8bf16, 1},
6151 {TTI::SK_Transpose, MVT::v2f32, 1},
6152 {TTI::SK_Transpose, MVT::v4f32, 1},
6153 {TTI::SK_Transpose, MVT::v2f64, 1},
6154 // Select shuffle kinds.
6155 // TODO: handle vXi8/vXi16.
6156 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6157 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6158 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6159 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6160 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6161 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6162 // PermuteSingleSrc shuffle kinds.
6163 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6164 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6165 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6166 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6167 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6168 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6169 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6170 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6171 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6172 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6173 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6174 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6175 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6176 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6177 // Reverse can be lowered with `rev`.
6178 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6179 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6180 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6181 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6182 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6183 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6184 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6185 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6186 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6187 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6188 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6189 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6190 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6191 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6192 // Splice can all be lowered as `ext`.
6193 {TTI::SK_Splice, MVT::v2i32, 1},
6194 {TTI::SK_Splice, MVT::v4i32, 1},
6195 {TTI::SK_Splice, MVT::v2i64, 1},
6196 {TTI::SK_Splice, MVT::v2f32, 1},
6197 {TTI::SK_Splice, MVT::v4f32, 1},
6198 {TTI::SK_Splice, MVT::v2f64, 1},
6199 {TTI::SK_Splice, MVT::v8f16, 1},
6200 {TTI::SK_Splice, MVT::v8bf16, 1},
6201 {TTI::SK_Splice, MVT::v8i16, 1},
6202 {TTI::SK_Splice, MVT::v16i8, 1},
6203 {TTI::SK_Splice, MVT::v4f16, 1},
6204 {TTI::SK_Splice, MVT::v4bf16, 1},
6205 {TTI::SK_Splice, MVT::v4i16, 1},
6206 {TTI::SK_Splice, MVT::v8i8, 1},
6207 // Broadcast shuffle kinds for scalable vectors
6208 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6209 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6210 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6211 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6212 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6213 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6214 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6215 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6216 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6217 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6218 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6219 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6220 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6221 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6222 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6223 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6224 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6225 // Handle the cases for vector.reverse with scalable vectors
6226 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6227 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6228 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6229 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6230 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6231 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6232 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6233 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6234 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6235 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6236 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6237 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6238 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6239 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6240 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6241 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6242 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6243 };
6244 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6245 return LT.first * Entry->Cost;
6246 }
6247
6248 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6249 return getSpliceCost(SrcTy, Index, CostKind);
6250
6251 // Inserting a subvector can often be done with either a D, S or H register
6252 // move, so long as the inserted vector is "aligned".
6253 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6254 LT.second.getSizeInBits() <= 128 && SubTp) {
6255 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6256 if (SubLT.second.isVector()) {
6257 int NumElts = LT.second.getVectorNumElements();
6258 int NumSubElts = SubLT.second.getVectorNumElements();
6259 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6260 return SubLT.first;
6261 }
6262 }
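// For example, inserting a v2f32 subvector into a v4f32 at index 0 or 2 lands
// on a 64-bit register boundary, so the cost is just the subvector's
// legalization factor (typically 1, i.e. a single register move).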
6263
6264 // Restore optimal kind.
6265 if (IsExtractSubvector)
6266 Kind = TTI::SK_ExtractSubvector;
6267 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6268 Args, CxtI);
6269}
6270
6271 static bool containsDecreasingPointers(Loop *TheLoop,
6272 PredicatedScalarEvolution *PSE,
6273 const DominatorTree &DT) {
6274 const auto &Strides = DenseMap<Value *, const SCEV *>();
6275 for (BasicBlock *BB : TheLoop->blocks()) {
6276 // Scan the instructions in the block and look for addresses that are
6277 // consecutive and decreasing.
6278 for (Instruction &I : *BB) {
6279 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6280 Value *Ptr = getLoadStorePointerOperand(&I);
6281 Type *AccessTy = getLoadStoreType(&I);
6282 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6283 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6284 .value_or(0) < 0)
6285 return true;
6286 }
6287 }
6288 }
6289 return false;
6290}
6291
6292 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6293 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6294 return SVEPreferFixedOverScalableIfEqualCost;
6295 // For cases like post-LTO vectorization, when we eventually know the trip
6296 // count, epilogue with fixed-width vectorization can be deleted if the trip
6297 // count is less than the epilogue iterations. That's why we prefer
6298 // fixed-width vectorization in epilogue in case of equal costs.
6299 if (IsEpilogue)
6300 return true;
6301 return ST->useFixedOverScalableIfEqualCost();
6302}
6303
6304 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6305 return ST->getEpilogueVectorizationMinVF();
6306}
6307
6308 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6309 if (!ST->hasSVE())
6310 return false;
6311
6312 // We don't currently support vectorisation with interleaving for SVE - with
6313 // such loops we're better off not using tail-folding. This gives us a chance
6314 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6315 if (TFI->IAI->hasGroups())
6316 return false;
6317
6318 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6319 if (TFI->LVL->getReductionVars().size())
6320 Required |= TailFoldingOpts::Reductions;
6321 if (TFI->LVL->getFixedOrderRecurrences().size())
6322 Required |= TailFoldingOpts::Recurrences;
6323
6324 // We call this to discover whether any load/store pointers in the loop have
6325 // negative strides. This will require extra work to reverse the loop
6326 // predicate, which may be expensive.
6327 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6328 TFI->LVL->getPredicatedScalarEvolution(),
6329 *TFI->LVL->getDominatorTree()))
6330 Required |= TailFoldingOpts::Reverse;
6331 if (Required == TailFoldingOpts::Disabled)
6332 Required |= TailFoldingOpts::Simple;
6333
6334 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6335 Required))
6336 return false;
6337
6338 // Don't tail-fold for tight loops where we would be better off interleaving
6339 // with an unpredicated loop.
6340 unsigned NumInsns = 0;
6341 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6342 NumInsns += BB->sizeWithoutDebug();
6343 }
6344
6345 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
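// For example, a loop whose body does little real work beyond those four
// loop-control instructions falls below the threshold, so we return false and
// leave it to unpredicated vectorization plus interleaving.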
6346 return NumInsns >= SVETailFoldInsnThreshold;
6347}
6348
6349 InstructionCost
6350 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6351 StackOffset BaseOffset, bool HasBaseReg,
6352 int64_t Scale, unsigned AddrSpace) const {
6353 // Scaling factors are not free at all.
6354 // Operands | Rt Latency
6355 // -------------------------------------------
6356 // Rt, [Xn, Xm] | 4
6357 // -------------------------------------------
6358 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6359 // Rt, [Xn, Wm, <extend> #imm] |
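// For example, ldr x0, [x1, x2, lsl #3] uses a scaled register operand, so a
// legal addressing mode whose Scale is neither 0 nor 1 is costed as 1 below,
// while unscaled forms are free.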
6360 TargetLoweringBase::AddrMode AM;
6361 AM.BaseGV = BaseGV;
6362 AM.BaseOffs = BaseOffset.getFixed();
6363 AM.HasBaseReg = HasBaseReg;
6364 AM.Scale = Scale;
6365 AM.ScalableOffset = BaseOffset.getScalable();
6366 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6367 // Scale represents reg2 * scale, thus account for 1 if
6368 // it is not equal to 0 or 1.
6369 return AM.Scale != 0 && AM.Scale != 1;
6370 return InstructionCost::getInvalid();
6371 }
6372
6373 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6374 const Instruction *I) const {
6375 if (EnableOrLikeSelectOpt) {
6376 // For the binary operators (e.g. or) we need to be more careful than
6377 // selects, here we only transform them if they are already at a natural
6378 // break point in the code - the end of a block with an unconditional
6379 // terminator.
6380 if (I->getOpcode() == Instruction::Or &&
6381 isa<BranchInst>(I->getNextNode()) &&
6382 cast<BranchInst>(I->getNextNode())->isUnconditional())
6383 return true;
6384
6385 if (I->getOpcode() == Instruction::Add ||
6386 I->getOpcode() == Instruction::Sub)
6387 return true;
6388 }
6389 return BaseT::shouldTreatInstructionLikeSelect(I);
6390 }
6391
6392 bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
6393 const TargetTransformInfo::LSRCost &C2) const {
6395 // AArch64 specific here is adding the number of instructions to the
6396 // comparison (though not as the first consideration, as some targets do)
6397 // along with changing the priority of the base additions.
6398 // TODO: Maybe a more nuanced tradeoff between instruction count
6399 // and number of registers? To be investigated at a later date.
6400 if (EnableLSRCostOpt)
6401 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6402 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6403 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6404 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6405
6406 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6407 }
6408
6409static bool isSplatShuffle(Value *V) {
6410 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6411 return all_equal(Shuf->getShuffleMask());
6412 return false;
6413}
6414
6415/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6416/// or upper half of the vector elements.
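/// For example, two shuffles such as
///   %h1 = shufflevector <8 x i16> %x, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %h2 = shufflevector <8 x i16> %y, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// both extract the upper half, which lets a widening multiply that consumes
/// them select the high variant (e.g. smull2/umull2).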
6417static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6418 bool AllowSplat = false) {
6419 // Scalable types can't be extract shuffle vectors.
6420 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6421 return false;
6422
6423 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6424 auto *FullTy = FullV->getType();
6425 auto *HalfTy = HalfV->getType();
6426 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6427 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6428 };
6429
6430 auto extractHalf = [](Value *FullV, Value *HalfV) {
6431 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6432 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6433 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6434 };
6435
6436 ArrayRef<int> M1, M2;
6437 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6438 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6439 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6440 return false;
6441
6442 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6443 // it is not checked as an extract below.
6444 if (AllowSplat && isSplatShuffle(Op1))
6445 S1Op1 = nullptr;
6446 if (AllowSplat && isSplatShuffle(Op2))
6447 S2Op1 = nullptr;
6448
6449 // Check that the operands are half as wide as the result and we extract
6450 // half of the elements of the input vectors.
6451 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6452 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6453 return false;
6454
6455 // Check the mask extracts either the lower or upper half of vector
6456 // elements.
6457 int M1Start = 0;
6458 int M2Start = 0;
6459 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6460 if ((S1Op1 &&
6461 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6462 (S2Op1 &&
6463 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6464 return false;
6465
6466 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6467 (M2Start != 0 && M2Start != (NumElements / 2)))
6468 return false;
6469 if (S1Op1 && S2Op1 && M1Start != M2Start)
6470 return false;
6471
6472 return true;
6473}
6474
6475/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6476/// of the vector elements.
6477static bool areExtractExts(Value *Ext1, Value *Ext2) {
6478 auto areExtDoubled = [](Instruction *Ext) {
6479 return Ext->getType()->getScalarSizeInBits() ==
6480 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6481 };
6482
6483 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6484 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6485 !areExtDoubled(cast<Instruction>(Ext1)) ||
6486 !areExtDoubled(cast<Instruction>(Ext2)))
6487 return false;
6488
6489 return true;
6490}
6491
6492/// Check if Op could be used with vmull_high_p64 intrinsic.
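/// For example, an operand of the form
///   %x = extractelement <2 x i64> %v, i64 1
/// extracts lane 1 of a 128-bit vector, matching the high-lane operand that
/// the pmull2 form of the polynomial multiply consumes.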
6493 static bool isOperandOfVmullHighP64(Value *Op) {
6494 Value *VectorOperand = nullptr;
6495 ConstantInt *ElementIndex = nullptr;
6496 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6497 m_ConstantInt(ElementIndex))) &&
6498 ElementIndex->getValue() == 1 &&
6499 isa<FixedVectorType>(VectorOperand->getType()) &&
6500 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6501}
6502
6503/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6504static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6505 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6506 }
6507
6508 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6509 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6509 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6510 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6511 if (!GEP || GEP->getNumOperands() != 2)
6512 return false;
6513
6514 Value *Base = GEP->getOperand(0);
6515 Value *Offsets = GEP->getOperand(1);
6516
6517 // We only care about scalar_base+vector_offsets.
6518 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6519 return false;
6520
6521 // Sink extends that would allow us to use 32-bit offset vectors.
6522 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6523 auto *OffsetsInst = cast<Instruction>(Offsets);
6524 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6525 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6526 Ops.push_back(&GEP->getOperandUse(1));
6527 }
6528
6529 // Sink the GEP.
6530 return true;
6531}
6532
6533/// We want to sink following cases:
6534/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6535 /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6536 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6537 if (match(Op, m_VScale()))
6538 return true;
6539 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6540 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6541 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6542 return true;
6543 }
6544 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6545 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6546 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6547 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6548 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6549 return true;
6550 }
6551 return false;
6552}
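// For example, given
//   %vs  = call i64 @llvm.vscale.i64()
//   %off = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %base, i64 %off
// the shl and the vscale call are sunk next to the GEP so that instruction
// selection can fold the whole offset into a scalable addressing form.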
6553
6554/// Check if sinking \p I's operands to I's basic block is profitable, because
6555/// the operands can be folded into a target instruction, e.g.
6556/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6557 bool AArch64TTIImpl::isProfitableToSinkOperands(
6558 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6559 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
6560 switch (II->getIntrinsicID()) {
6561 case Intrinsic::aarch64_neon_smull:
6562 case Intrinsic::aarch64_neon_umull:
6563 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6564 /*AllowSplat=*/true)) {
6565 Ops.push_back(&II->getOperandUse(0));
6566 Ops.push_back(&II->getOperandUse(1));
6567 return true;
6568 }
6569 [[fallthrough]];
6570
6571 case Intrinsic::fma:
6572 case Intrinsic::fmuladd:
6573 if (isa<VectorType>(I->getType()) &&
6574 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6575 !ST->hasFullFP16())
6576 return false;
6577 [[fallthrough]];
6578 case Intrinsic::aarch64_neon_sqdmull:
6579 case Intrinsic::aarch64_neon_sqdmulh:
6580 case Intrinsic::aarch64_neon_sqrdmulh:
6581 // Sink splats for index lane variants
6582 if (isSplatShuffle(II->getOperand(0)))
6583 Ops.push_back(&II->getOperandUse(0));
6584 if (isSplatShuffle(II->getOperand(1)))
6585 Ops.push_back(&II->getOperandUse(1));
6586 return !Ops.empty();
6587 case Intrinsic::aarch64_neon_fmlal:
6588 case Intrinsic::aarch64_neon_fmlal2:
6589 case Intrinsic::aarch64_neon_fmlsl:
6590 case Intrinsic::aarch64_neon_fmlsl2:
6591 // Sink splats for index lane variants
6592 if (isSplatShuffle(II->getOperand(1)))
6593 Ops.push_back(&II->getOperandUse(1));
6594 if (isSplatShuffle(II->getOperand(2)))
6595 Ops.push_back(&II->getOperandUse(2));
6596 return !Ops.empty();
6597 case Intrinsic::aarch64_sve_ptest_first:
6598 case Intrinsic::aarch64_sve_ptest_last:
6599 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6600 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6601 Ops.push_back(&II->getOperandUse(0));
6602 return !Ops.empty();
6603 case Intrinsic::aarch64_sme_write_horiz:
6604 case Intrinsic::aarch64_sme_write_vert:
6605 case Intrinsic::aarch64_sme_writeq_horiz:
6606 case Intrinsic::aarch64_sme_writeq_vert: {
6607 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6608 if (!Idx || Idx->getOpcode() != Instruction::Add)
6609 return false;
6610 Ops.push_back(&II->getOperandUse(1));
6611 return true;
6612 }
6613 case Intrinsic::aarch64_sme_read_horiz:
6614 case Intrinsic::aarch64_sme_read_vert:
6615 case Intrinsic::aarch64_sme_readq_horiz:
6616 case Intrinsic::aarch64_sme_readq_vert:
6617 case Intrinsic::aarch64_sme_ld1b_vert:
6618 case Intrinsic::aarch64_sme_ld1h_vert:
6619 case Intrinsic::aarch64_sme_ld1w_vert:
6620 case Intrinsic::aarch64_sme_ld1d_vert:
6621 case Intrinsic::aarch64_sme_ld1q_vert:
6622 case Intrinsic::aarch64_sme_st1b_vert:
6623 case Intrinsic::aarch64_sme_st1h_vert:
6624 case Intrinsic::aarch64_sme_st1w_vert:
6625 case Intrinsic::aarch64_sme_st1d_vert:
6626 case Intrinsic::aarch64_sme_st1q_vert:
6627 case Intrinsic::aarch64_sme_ld1b_horiz:
6628 case Intrinsic::aarch64_sme_ld1h_horiz:
6629 case Intrinsic::aarch64_sme_ld1w_horiz:
6630 case Intrinsic::aarch64_sme_ld1d_horiz:
6631 case Intrinsic::aarch64_sme_ld1q_horiz:
6632 case Intrinsic::aarch64_sme_st1b_horiz:
6633 case Intrinsic::aarch64_sme_st1h_horiz:
6634 case Intrinsic::aarch64_sme_st1w_horiz:
6635 case Intrinsic::aarch64_sme_st1d_horiz:
6636 case Intrinsic::aarch64_sme_st1q_horiz: {
6637 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6638 if (!Idx || Idx->getOpcode() != Instruction::Add)
6639 return false;
6640 Ops.push_back(&II->getOperandUse(3));
6641 return true;
6642 }
6643 case Intrinsic::aarch64_neon_pmull:
6644 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6645 return false;
6646 Ops.push_back(&II->getOperandUse(0));
6647 Ops.push_back(&II->getOperandUse(1));
6648 return true;
6649 case Intrinsic::aarch64_neon_pmull64:
6650 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6651 II->getArgOperand(1)))
6652 return false;
6653 Ops.push_back(&II->getArgOperandUse(0));
6654 Ops.push_back(&II->getArgOperandUse(1));
6655 return true;
6656 case Intrinsic::masked_gather:
6657 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6658 return false;
6659 Ops.push_back(&II->getArgOperandUse(0));
6660 return true;
6661 case Intrinsic::masked_scatter:
6662 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6663 return false;
6664 Ops.push_back(&II->getArgOperandUse(1));
6665 return true;
6666 default:
6667 return false;
6668 }
6669 }
6670
6671 auto ShouldSinkCondition = [](Value *Cond,
6672 SmallVectorImpl<Use *> &Ops) -> bool {
6673 if (!isa<IntrinsicInst>(Cond))
6674 return false;
6675 auto *II = cast<IntrinsicInst>(Cond);
6676 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6677 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6678 return false;
6679 if (isa<CmpInst>(II->getOperand(0)))
6680 Ops.push_back(&II->getOperandUse(0));
6681 return true;
6682 };
6683
6684 switch (I->getOpcode()) {
6685 case Instruction::GetElementPtr:
6686 case Instruction::Add:
6687 case Instruction::Sub:
6688 // Sink vscales closer to uses for better isel
6689 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6690 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6691 Ops.push_back(&I->getOperandUse(Op));
6692 return true;
6693 }
6694 }
6695 break;
6696 case Instruction::Select: {
6697 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6698 return false;
6699
6700 Ops.push_back(&I->getOperandUse(0));
6701 return true;
6702 }
6703 case Instruction::Br: {
6704 if (cast<BranchInst>(I)->isUnconditional())
6705 return false;
6706
6707 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6708 return false;
6709
6710 Ops.push_back(&I->getOperandUse(0));
6711 return true;
6712 }
6713 default:
6714 break;
6715 }
6716
6717 if (!I->getType()->isVectorTy())
6718 return false;
6719
6720 switch (I->getOpcode()) {
6721 case Instruction::Sub:
6722 case Instruction::Add: {
6723 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6724 return false;
6725
6726 // If the exts' operands extract either the lower or upper elements, we
6727 // can sink them too.
6728 auto Ext1 = cast<Instruction>(I->getOperand(0));
6729 auto Ext2 = cast<Instruction>(I->getOperand(1));
6730 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6731 Ops.push_back(&Ext1->getOperandUse(0));
6732 Ops.push_back(&Ext2->getOperandUse(0));
6733 }
6734
6735 Ops.push_back(&I->getOperandUse(0));
6736 Ops.push_back(&I->getOperandUse(1));
6737
6738 return true;
6739 }
6740 case Instruction::Or: {
6741 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6742 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6743 if (ST->hasNEON()) {
6744 Instruction *OtherAnd, *IA, *IB;
6745 Value *MaskValue;
6746 // MainAnd refers to And instruction that has 'Not' as one of its operands
6747 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6748 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6749 m_Instruction(IA)))))) {
6750 if (match(OtherAnd,
6751 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6752 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6753 ? cast<Instruction>(I->getOperand(1))
6754 : cast<Instruction>(I->getOperand(0));
6755
6756 // Both Ands should be in same basic block as Or
6757 if (I->getParent() != MainAnd->getParent() ||
6758 I->getParent() != OtherAnd->getParent())
6759 return false;
6760
6761 // Non-mask operands of both Ands should also be in same basic block
6762 if (I->getParent() != IA->getParent() ||
6763 I->getParent() != IB->getParent())
6764 return false;
6765
6766 Ops.push_back(
6767 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6768 Ops.push_back(&I->getOperandUse(0));
6769 Ops.push_back(&I->getOperandUse(1));
6770
6771 return true;
6772 }
6773 }
6774 }
6775
6776 return false;
6777 }
6778 case Instruction::Mul: {
6779 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6780 auto *Ty = cast<VectorType>(V->getType());
6781 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6782 if (Ty->isScalableTy())
6783 return false;
6784
6785 // Indexed variants of Mul exist for i16 and i32 element types only.
6786 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6787 };
6788
6789 int NumZExts = 0, NumSExts = 0;
6790 for (auto &Op : I->operands()) {
6791 // Make sure we are not already sinking this operand
6792 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6793 continue;
6794
6795 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6796 auto *Ext = cast<Instruction>(Op);
6797 auto *ExtOp = Ext->getOperand(0);
6798 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6799 Ops.push_back(&Ext->getOperandUse(0));
6800 Ops.push_back(&Op);
6801
6802 if (isa<SExtInst>(Ext)) {
6803 NumSExts++;
6804 } else {
6805 NumZExts++;
6806 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6807 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6808 I->getType()->getScalarSizeInBits())
6809 NumSExts++;
6810 }
6811
6812 continue;
6813 }
6814
6815 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6816 if (!Shuffle)
6817 continue;
6818
6819 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6820 // operand and the s/zext can help create indexed s/umull. This is
6821 // especially useful to prevent i64 mul being scalarized.
6822 if (isSplatShuffle(Shuffle) &&
6823 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6824 Ops.push_back(&Shuffle->getOperandUse(0));
6825 Ops.push_back(&Op);
6826 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6827 NumSExts++;
6828 else
6829 NumZExts++;
6830 continue;
6831 }
6832
6833 Value *ShuffleOperand = Shuffle->getOperand(0);
6834 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6835 if (!Insert)
6836 continue;
6837
6838 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6839 if (!OperandInstr)
6840 continue;
6841
6842 ConstantInt *ElementConstant =
6843 dyn_cast<ConstantInt>(Insert->getOperand(2));
6844 // Check that the insertelement is inserting into element 0
6845 if (!ElementConstant || !ElementConstant->isZero())
6846 continue;
6847
6848 unsigned Opcode = OperandInstr->getOpcode();
6849 if (Opcode == Instruction::SExt)
6850 NumSExts++;
6851 else if (Opcode == Instruction::ZExt)
6852 NumZExts++;
6853 else {
6854 // If we find that the top bits are known 0, then we can sink and allow
6855 // the backend to generate a umull.
6856 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6857 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6858 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6859 continue;
6860 NumZExts++;
6861 }
6862
6863 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6864 // the And, just to hoist it again back to the load.
6865 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6866 Ops.push_back(&Insert->getOperandUse(1));
6867 Ops.push_back(&Shuffle->getOperandUse(0));
6868 Ops.push_back(&Op);
6869 }
6870
6871 // It is profitable to sink if we found two of the same type of extends.
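// For example, for
//   %za = zext <2 x i32> %a to <2 x i64>
//   %zb = zext <2 x i32> %b to <2 x i64>
//   %m  = mul <2 x i64> %za, %zb
// sinking both extends next to the mul lets the backend form a single umull
// rather than scalarizing the 64-bit multiply.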
6872 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6873 return true;
6874
6875 // Otherwise, see if we should sink splats for indexed variants.
6876 if (!ShouldSinkSplatForIndexedVariant(I))
6877 return false;
6878
6879 Ops.clear();
6880 if (isSplatShuffle(I->getOperand(0)))
6881 Ops.push_back(&I->getOperandUse(0));
6882 if (isSplatShuffle(I->getOperand(1)))
6883 Ops.push_back(&I->getOperandUse(1));
6884
6885 return !Ops.empty();
6886 }
6887 case Instruction::FMul: {
6888 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6889 if (I->getType()->isScalableTy())
6890 return false;
6891
6892 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6893 !ST->hasFullFP16())
6894 return false;
6895
6896 // Sink splats for index lane variants
6897 if (isSplatShuffle(I->getOperand(0)))
6898 Ops.push_back(&I->getOperandUse(0));
6899 if (isSplatShuffle(I->getOperand(1)))
6900 Ops.push_back(&I->getOperandUse(1));
6901 return !Ops.empty();
6902 }
6903 default:
6904 return false;
6905 }
6906 return false;
6907}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
const TargetInstrInfo & TII
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
unsigned countLeadingOnes() const
Definition APInt.h:1625
void negate()
Negate this APInt in place.
Definition APInt.h:1469
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1762
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:760
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
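As a small illustrative example (assuming an LLVMContext Ctx is available), ElementCount distinguishes fixed from scalable vector shapes when building vector types:
Type *I32 = Type::getInt32Ty(Ctx);
VectorType *FixedVT    = VectorType::get(I32, ElementCount::getFixed(4));    // <4 x i32>
VectorType *ScalableVT = VectorType::get(I32, ElementCount::getScalable(4)); // <vscale x 4 x i32>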
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2788
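A minimal sketch of IRBuilder usage, under the assumption that an LLVMContext Ctx, a basic block BB, and a scalar value Scalar exist in the surrounding code:
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(BB);  // append new instructions to the end of BB
Value *Splat = Builder.CreateVectorSplat(4, Scalar);
Value *Lane0 = Builder.CreateExtractElement(Splat, Builder.getInt64(0));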
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full range of operator support required for arithmetic and comparisons.
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of existing predicates.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value, if possible.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
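A short sketch of how these ScalarEvolution queries are typically combined (assuming SE is the ScalarEvolution analysis result and L is a Loop*; this is an illustration, not this file's logic):
const SCEV *BTC = SE.getBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(BTC) && SE.isLoopInvariant(BTC, L)) {
  // The backedge-taken count is computable and loop-invariant; it can now
  // feed unrolling or vectorization heuristics.
}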
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInfo-derived member variable.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type, and the element type is an integer type of the same width as the input element type.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the same way as for normal integer types.
Definition TypeSize.h:252
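Illustrative use of the fixed/scalable quantity API (assuming DL is a DataLayout and Ty a Type*; a sketch, not code from this file):
TypeSize TS = DL.getTypeSizeInBits(Ty);
if (TS.isScalable()) {
  // The size is TS.getKnownMinValue() * vscale bits, only known at runtime.
} else {
  uint64_t Bits = TS.getFixedValue();  // exact compile-time size in bits
  (void)Bits;
}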
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the given register size.
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to synthesize the immediate.
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
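A small PatternMatch sketch (assuming a Value *V and the usual using namespace llvm::PatternMatch;): capture the narrow source of a single-use zext/sext.
Value *X = nullptr;
if (match(V, m_OneUse(m_ZExtOrSExt(m_Value(X))))) {
  // X is the pre-extension operand of V.
}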
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
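For illustration, a cost-table lookup follows this shape; the entries below are invented examples, not the tables this file defines.
static const CostTblEntry ExampleTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::MUL, MVT::v2i64, 4},
};
if (const auto *Entry = CostTableLookup(ExampleTbl, ISD::MUL, MVT::v2i64)) {
  InstructionCost C = Entry->Cost;  // 4 in this made-up table
  (void)C;
}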
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned element.
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
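As a concrete illustration of the mask helpers above (an example mask, not one taken from this file):
// <3,2,1,0,7,6,5,4> reverses 16-bit lanes inside each 64-bit block of a
// v8i16 shuffle, so it is expected to match a REV64-style mask.
int Mask[] = {3, 2, 1, 0, 7, 6, 5, 4};
bool IsRev64 = isREVMask(Mask, /*EltSize=*/16, /*NumElts=*/8, /*BlockSize=*/64);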
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are integer type.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2120
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
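Type-conversion lookups follow the same pattern, keyed on both destination and source MVT (illustrative entries only, not data from this file):
static const TypeConversionCostTblEntry ExampleConvTbl[] = {
    {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
    {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 2},
};
if (const auto *Entry = ConvertCostTableLookup(ExampleConvTbl, ISD::SIGN_EXTEND,
                                               MVT::v4i32, MVT::v4i16)) {
  InstructionCost C = Entry->Cost;  // 1 in this made-up table
  (void)C;
}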
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead).
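Finally, a sketch of how a target hook typically populates these unrolling knobs; the values below are illustrative defaults, not the choices made in this file.
void getUnrollingPreferencesExample(Loop *L, ScalarEvolution &SE,
                                    TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                 // allow partial unrolling
  UP.Runtime = true;                 // allow runtime unrolling
  UP.DefaultUnrollRuntimeCount = 4;  // assumed example count
  UP.UpperBound = true;              // may use the trip-count upper bound
}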