1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
214 cl::location(TailFoldingOptionLoc));
215
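// Illustrative example (a sketch, not part of the upstream source): with
// -sve-tail-folding=all+noreductions, operator= above records
//   InitialBits = TailFoldingOpts::All, DisableBits = TailFoldingOpts::Reductions,
// so a later query such as
//   TailFoldingOptionLoc.satisfies(Defaults, TailFoldingOpts::Reductions)
// evaluates to false, while the remaining flags implied by "all" still hold.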
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
248 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
255static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
263APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
264 SmallVector<StringRef, 8> Features;
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
269uint64_t AArch64TTIImpl::getFMVPriority(const Function &F) const {
270 SmallVector<StringRef, 8> Features;
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
275bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
276 return F.hasFnAttribute("fmv-features");
277}
278
279const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280 AArch64::FeatureExecuteOnly,
281};
282
283bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
284 const Function *Callee) const {
285 SMECallAttrs CallAttrs(*Caller, *Callee);
286
287 // Never inline a function explicitly marked as being streaming,
288 // into a non-streaming function. Assume it was marked as streaming
289 // for a reason.
290 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
291 CallAttrs.callee().hasStreamingInterfaceOrBody())
292 return false;
293
294 // When inlining, we should consider the body of the function, not the
295 // interface.
296 if (CallAttrs.callee().hasStreamingBody()) {
297 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
298 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
299 }
300
301 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
302 return false;
303
304 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
305 CallAttrs.requiresPreservingZT0() ||
306 CallAttrs.requiresPreservingAllZAState()) {
307 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
308 return false;
309 }
310
311 const TargetMachine &TM = getTLI()->getTargetMachine();
312 const FeatureBitset &CallerBits =
313 TM.getSubtargetImpl(*Caller)->getFeatureBits();
314 const FeatureBitset &CalleeBits =
315 TM.getSubtargetImpl(*Callee)->getFeatureBits();
316 // Adjust the feature bitsets by inverting some of the bits. This is needed
317 // for target features that represent restrictions rather than capabilities,
318 // for example a "+execute-only" callee can be inlined into a caller without
319 // "+execute-only", but not vice versa.
320 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324}
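// Worked example (illustrative, assuming FeatureExecuteOnly is the only
// inverted feature): for a caller built with {+neon} and a callee built with
// {+neon,+execute-only}, the XOR above gives
//   EffectiveCallerBits = {+neon,+execute-only}
//   EffectiveCalleeBits = {+neon}
// so the subset test passes and the execute-only callee may be inlined.
// Swapping caller and callee fails the test, matching the comment above.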
325
326bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
327 const Function *Callee,
328 ArrayRef<Type *> Types) const {
329 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330 return false;
331
332 // We need to ensure that argument promotion does not attempt to promote
333 // pointers to fixed-length vector types larger than 128 bits like
334 // <8 x float> (and pointers to aggregate types which have such fixed-length
335 // vector type members) into the values of the pointees. Such vector types
336 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
338 // types can be safely treated as 128-bit NEON types and they cannot be
339 // distinguished in IR.
340 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
341 auto FVTy = dyn_cast<FixedVectorType>(Ty);
342 return FVTy &&
343 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
344 }))
345 return false;
346
347 return true;
348}
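// Example (illustrative): with SVE used for fixed-length vectors, a pointer
// argument whose pointee is <8 x float> (256 bits) must not be promoted to a
// by-value vector argument because there is no ABI for such SVE VLS values,
// so the check above returns false. A 128-bit type such as <4 x float> is
// indistinguishable from a NEON vector and remains compatible.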
349
350unsigned
351AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
352 unsigned DefaultCallPenalty) const {
353 // This function calculates a penalty for executing Call in F.
354 //
355 // There are two ways this function can be called:
356 // (1) F:
357 // call from F -> G (the call here is Call)
358 //
359 // For (1), Call.getCaller() == F, so it will always return a high cost if
360 // a streaming-mode change is required (thus promoting the need to inline the
361 // function)
362 //
363 // (2) F:
364 // call from F -> G (the call here is not Call)
365 // G:
366 // call from G -> H (the call here is Call)
367 //
368 // For (2), if after inlining the body of G into F the call to H requires a
369 // streaming-mode change, and the call to G from F would also require a
370 // streaming-mode change, then there is benefit to do the streaming-mode
371 // change only once and avoid inlining of G into F.
372
373 SMEAttrs FAttrs(*F);
374 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
377 if (F == Call.getCaller()) // (1)
378 return CallPenaltyChangeSM * DefaultCallPenalty;
379 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381 }
382
383 return DefaultCallPenalty;
384}
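// Rough numeric sketch (using the cl::opt defaults above): if Call requires a
// streaming-mode change and F is its direct caller (case (1)), the returned
// penalty is CallPenaltyChangeSM * DefaultCallPenalty, i.e. 5x the default;
// for case (2) it is InlineCallPenaltyChangeSM * DefaultCallPenalty, i.e. 10x,
// which discourages inlining G into F when both edges need a mode change.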
385
386bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
387 TargetTransformInfo::RegisterKind K) const {
388 assert(K != TargetTransformInfo::RGK_Scalar);
389
390 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391 return true;
392
393 return K == TargetTransformInfo::RGK_ScalableVector &&
394 ST->isSVEorStreamingSVEAvailable() &&
395 !ST->disableMaximizeScalableBandwidth();
396}
397
398/// Calculate the cost of materializing a 64-bit value. This helper
399/// method might only calculate a fraction of a larger immediate. Therefore it
400/// is valid to return a cost of ZERO.
401InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
402 // Check if the immediate can be encoded within an instruction.
403 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
404 return 0;
405
406 if (Val < 0)
407 Val = ~Val;
408
409 // Calculate how many moves we will need to materialize this constant.
410 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
411 AArch64_IMM::expandMOVImm(Val, 64, Insn);
412 return Insn.size();
413}
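// Example costs (an illustrative sketch, not exhaustive): 0xFF00FF00FF00FF00
// is a valid 64-bit logical immediate (a repeating 16-bit pattern of eight
// set bits), so its cost is 0, whereas a value such as 0x123456789ABCDEF0
// typically expands to one MOVZ plus three MOVKs, giving a cost of 4.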
414
415/// Calculate the cost of materializing the given constant.
416InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
417 TTI::TargetCostKind CostKind) const {
418
419 assert(Ty->isIntegerTy());
420
421 unsigned BitSize = Ty->getPrimitiveSizeInBits();
422 if (BitSize == 0)
423 return ~0U;
424
425 // Sign-extend all constants to a multiple of 64-bit.
426 APInt ImmVal = Imm;
427 if (BitSize & 0x3f)
428 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
429
430 // Split the constant into 64-bit chunks and calculate the cost for each
431 // chunk.
432 InstructionCost Cost = 0;
433 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
434 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
435 int64_t Val = Tmp.getSExtValue();
436 Cost += getIntImmCost(Val);
437 }
438 // We need at least one instruction to materialize the constant.
439 return std::max<InstructionCost>(1, Cost);
440}
441
442InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
443 const APInt &Imm, Type *Ty,
444 TTI::TargetCostKind CostKind,
445 Instruction *Inst) const {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 unsigned ImmIdx = ~0U;
455 switch (Opcode) {
456 default:
457 return TTI::TCC_Free;
458 case Instruction::GetElementPtr:
459 // Always hoist the base address of a GetElementPtr.
460 if (Idx == 0)
461 return 2 * TTI::TCC_Basic;
462 return TTI::TCC_Free;
463 case Instruction::Store:
464 ImmIdx = 0;
465 break;
466 case Instruction::Add:
467 case Instruction::Sub:
468 case Instruction::Mul:
469 case Instruction::UDiv:
470 case Instruction::SDiv:
471 case Instruction::URem:
472 case Instruction::SRem:
473 case Instruction::And:
474 case Instruction::Or:
475 case Instruction::Xor:
476 case Instruction::ICmp:
477 ImmIdx = 1;
478 break;
479 // Always return TCC_Free for the shift value of a shift instruction.
480 case Instruction::Shl:
481 case Instruction::LShr:
482 case Instruction::AShr:
483 if (Idx == 1)
484 return TTI::TCC_Free;
485 break;
486 case Instruction::Trunc:
487 case Instruction::ZExt:
488 case Instruction::SExt:
489 case Instruction::IntToPtr:
490 case Instruction::PtrToInt:
491 case Instruction::BitCast:
492 case Instruction::PHI:
493 case Instruction::Call:
494 case Instruction::Select:
495 case Instruction::Ret:
496 case Instruction::Load:
497 break;
498 }
499
500 if (Idx == ImmIdx) {
501 int NumConstants = (BitSize + 63) / 64;
502 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
503 return (Cost <= NumConstants * TTI::TCC_Basic)
504 ? static_cast<int>(TTI::TCC_Free)
505 : Cost;
506 }
507 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
508}
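// Example (illustrative): for "add i64 %x, C" the immediate is at ImmIdx == 1
// and BitSize == 64, so NumConstants == 1. If C can be materialized with a
// single instruction the cost is <= TCC_Basic and TCC_Free is returned (the
// constant stays in place); otherwise the real materialization cost is
// returned so that constant hoisting can share it across uses.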
509
510InstructionCost
511AArch64TTIImpl::getIntImmCostIntrin(unsigned IID, unsigned Idx,
512 const APInt &Imm, Type *Ty,
513 TTI::TargetCostKind CostKind) const {
514 assert(Ty->isIntegerTy());
515
516 unsigned BitSize = Ty->getPrimitiveSizeInBits();
517 // There is no cost model for constants with a bit size of 0. Return TCC_Free
518 // here, so that constant hoisting will ignore this constant.
519 if (BitSize == 0)
520 return TTI::TCC_Free;
521
522 // Most (all?) AArch64 intrinsics do not support folding immediates into the
523 // selected instruction, so we compute the materialization cost for the
524 // immediate directly.
525 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
526 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
527
528 switch (IID) {
529 default:
530 return TTI::TCC_Free;
531 case Intrinsic::sadd_with_overflow:
532 case Intrinsic::uadd_with_overflow:
533 case Intrinsic::ssub_with_overflow:
534 case Intrinsic::usub_with_overflow:
535 case Intrinsic::smul_with_overflow:
536 case Intrinsic::umul_with_overflow:
537 if (Idx == 1) {
538 int NumConstants = (BitSize + 63) / 64;
539 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
540 return (Cost <= NumConstants * TTI::TCC_Basic)
541 ? static_cast<int>(TTI::TCC_Free)
542 : Cost;
543 }
544 break;
545 case Intrinsic::experimental_stackmap:
546 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
547 return TTI::TCC_Free;
548 break;
549 case Intrinsic::experimental_patchpoint_void:
550 case Intrinsic::experimental_patchpoint:
551 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
552 return TTI::TCC_Free;
553 break;
554 case Intrinsic::experimental_gc_statepoint:
555 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
556 return TTI::TCC_Free;
557 break;
558 }
559 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
560}
561
562TargetTransformInfo::PopcntSupportKind
563AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565 if (TyWidth == 32 || TyWidth == 64)
566 return TTI::PSK_FastHardware;
567 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568 return TTI::PSK_Software;
569}
570
571static bool isUnpackedVectorVT(EVT VecVT) {
572 return VecVT.isScalableVector() &&
573 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
574}
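// For example (illustrative), nxv2f32 occupies only 64 known-min bits of the
// 128-bit SVE granule and is therefore "unpacked", while nxv4f32 fills the
// whole granule and is packed.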
575
576static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
577 const IntrinsicCostAttributes &ICA) {
578 // We need to know at least the number of elements in the vector of buckets
579 // and the size of each element to update.
580 if (ICA.getArgTypes().size() < 2)
581 return InstructionCost::getInvalid();
582
583 // Only interested in costing for the hardware instruction from SVE2.
584 if (!ST->hasSVE2())
585 return InstructionCost::getInvalid();
586
587 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
588 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
589 unsigned TotalHistCnts = 1;
590
591 unsigned EltSize = EltTy->getScalarSizeInBits();
592 // Only allow (up to 64b) integers or pointers
593 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
594 return InstructionCost::getInvalid();
595
596 // FIXME: We should be able to generate histcnt for fixed-length vectors
597 // using ptrue with a specific VL.
598 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
599 unsigned EC = VTy->getElementCount().getKnownMinValue();
600 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
601 return InstructionCost::getInvalid();
602
603 // HistCnt only supports 32b and 64b element types
604 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
605
606 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
607 return InstructionCost(BaseHistCntCost);
608
609 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610 TotalHistCnts = EC / NaturalVectorWidth;
611
612 return InstructionCost(BaseHistCntCost * TotalHistCnts);
613 }
614
615 return InstructionCost::getInvalid();
616}
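// Worked example (illustrative, using the default aarch64-base-histcnt-cost
// of 8): for <vscale x 8 x i64> bucket pointers updating i64 elements,
// LegalEltSize is 64, the natural vector width is 128 / 64 = 2, so
// TotalHistCnts = 8 / 2 = 4 and the returned cost is 8 * 4 = 32.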
617
618InstructionCost
619AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
620 TTI::TargetCostKind CostKind) const {
621 // The code-generator is currently not able to handle scalable vectors
622 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623 // it. This change will be removed when code-generation for these types is
624 // sufficiently reliable.
625 auto *RetTy = ICA.getReturnType();
626 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
627 if (VTy->getElementCount() == ElementCount::getScalable(1))
628 return InstructionCost::getInvalid();
629
630 switch (ICA.getID()) {
631 case Intrinsic::experimental_vector_histogram_add: {
632 InstructionCost HistCost = getHistogramCost(ST, ICA);
633 // If the cost isn't valid, we may still be able to scalarize
634 if (HistCost.isValid())
635 return HistCost;
636 break;
637 }
638 case Intrinsic::umin:
639 case Intrinsic::umax:
640 case Intrinsic::smin:
641 case Intrinsic::smax: {
642 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643 MVT::v8i16, MVT::v2i32, MVT::v4i32,
644 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645 MVT::nxv2i64};
646 auto LT = getTypeLegalizationCost(RetTy);
647 // v2i64 types get converted to cmp+bif hence the cost of 2
648 if (LT.second == MVT::v2i64)
649 return LT.first * 2;
650 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
651 return LT.first;
652 break;
653 }
654 case Intrinsic::sadd_sat:
655 case Intrinsic::ssub_sat:
656 case Intrinsic::uadd_sat:
657 case Intrinsic::usub_sat: {
658 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
659 MVT::v8i16, MVT::v2i32, MVT::v4i32,
660 MVT::v2i64};
661 auto LT = getTypeLegalizationCost(RetTy);
662 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
663 // need to extend the type, as it uses shr(qadd(shl, shl)).
664 unsigned Instrs =
665 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
666 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
667 return LT.first * Instrs;
668
669 TypeSize TS = DL.getTypeSizeInBits(RetTy);
670 uint64_t VectorSize = TS.getKnownMinValue();
671
672 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
673 return LT.first * Instrs;
674
675 break;
676 }
677 case Intrinsic::abs: {
678 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
679 MVT::v8i16, MVT::v2i32, MVT::v4i32,
680 MVT::v2i64};
681 auto LT = getTypeLegalizationCost(RetTy);
682 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
683 return LT.first;
684 break;
685 }
686 case Intrinsic::bswap: {
687 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
688 MVT::v4i32, MVT::v2i64};
689 auto LT = getTypeLegalizationCost(RetTy);
690 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
691 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
692 return LT.first;
693 break;
694 }
695 case Intrinsic::fma:
696 case Intrinsic::fmuladd: {
697 // Given a fma or fmuladd, cost it the same as an fmul instruction, which
698 // usually has the same cost. TODO: Add fp16 and bf16 expansion costs.
699 Type *EltTy = RetTy->getScalarType();
700 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
701 (EltTy->isHalfTy() && ST->hasFullFP16()))
702 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
703 break;
704 }
705 case Intrinsic::stepvector: {
706 InstructionCost Cost = 1; // Cost of the `index' instruction
707 auto LT = getTypeLegalizationCost(RetTy);
708 // Legalisation of illegal vectors involves an `index' instruction plus
709 // (LT.first - 1) vector adds.
710 if (LT.first > 1) {
711 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
712 InstructionCost AddCost =
713 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
714 Cost += AddCost * (LT.first - 1);
715 }
716 return Cost;
717 }
718 case Intrinsic::vector_extract:
719 case Intrinsic::vector_insert: {
720 // If both the vector and subvector types are legal types and the index
721 // is 0, then this should be a no-op or simple operation; return a
722 // relatively low cost.
723
724 // If arguments aren't actually supplied, then we cannot determine the
725 // value of the index. We also want to skip predicate types.
726 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
728 break;
729
730 LLVMContext &C = RetTy->getContext();
731 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
732 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
733 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
734 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
735 // Skip this if either the vector or subvector types are unpacked
736 // SVE types; they may get lowered to stack stores and loads.
737 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
738 break;
739
740 TargetLoweringBase::LegalizeKind SubVecLK =
741 getTLI()->getTypeConversion(C, SubVecVT);
742 TargetLoweringBase::LegalizeKind VecLK =
743 getTLI()->getTypeConversion(C, VecVT);
744 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
745 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
746 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
747 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
748 return TTI::TCC_Free;
749 break;
750 }
751 case Intrinsic::bitreverse: {
752 static const CostTblEntry BitreverseTbl[] = {
753 {Intrinsic::bitreverse, MVT::i32, 1},
754 {Intrinsic::bitreverse, MVT::i64, 1},
755 {Intrinsic::bitreverse, MVT::v8i8, 1},
756 {Intrinsic::bitreverse, MVT::v16i8, 1},
757 {Intrinsic::bitreverse, MVT::v4i16, 2},
758 {Intrinsic::bitreverse, MVT::v8i16, 2},
759 {Intrinsic::bitreverse, MVT::v2i32, 2},
760 {Intrinsic::bitreverse, MVT::v4i32, 2},
761 {Intrinsic::bitreverse, MVT::v1i64, 2},
762 {Intrinsic::bitreverse, MVT::v2i64, 2},
763 };
764 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
765 const auto *Entry =
766 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
767 if (Entry) {
768 // The cost model uses the legal type (i32) that i8 and i16 will be
769 // promoted to, plus 1 so that we match the actual lowering cost.
770 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
771 TLI->getValueType(DL, RetTy, true) == MVT::i16)
772 return LegalisationCost.first * Entry->Cost + 1;
773
774 return LegalisationCost.first * Entry->Cost;
775 }
776 break;
777 }
778 case Intrinsic::ctpop: {
779 if (!ST->hasNEON()) {
780 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
781 return getTypeLegalizationCost(RetTy).first * 12;
782 }
783 static const CostTblEntry CtpopCostTbl[] = {
784 {ISD::CTPOP, MVT::v2i64, 4},
785 {ISD::CTPOP, MVT::v4i32, 3},
786 {ISD::CTPOP, MVT::v8i16, 2},
787 {ISD::CTPOP, MVT::v16i8, 1},
788 {ISD::CTPOP, MVT::i64, 4},
789 {ISD::CTPOP, MVT::v2i32, 3},
790 {ISD::CTPOP, MVT::v4i16, 2},
791 {ISD::CTPOP, MVT::v8i8, 1},
792 {ISD::CTPOP, MVT::i32, 5},
793 };
794 auto LT = getTypeLegalizationCost(RetTy);
795 MVT MTy = LT.second;
796 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
797 // Extra cost of +1 when illegal vector types are legalized by promoting
798 // the integer type.
799 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
800 RetTy->getScalarSizeInBits()
801 ? 1
802 : 0;
803 return LT.first * Entry->Cost + ExtraCost;
804 }
805 break;
806 }
807 case Intrinsic::sadd_with_overflow:
808 case Intrinsic::uadd_with_overflow:
809 case Intrinsic::ssub_with_overflow:
810 case Intrinsic::usub_with_overflow:
811 case Intrinsic::smul_with_overflow:
812 case Intrinsic::umul_with_overflow: {
813 static const CostTblEntry WithOverflowCostTbl[] = {
814 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
815 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
816 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
817 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
818 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
819 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
820 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
821 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
822 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
823 {Intrinsic::usub_with_overflow, MVT::i8, 3},
824 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
825 {Intrinsic::usub_with_overflow, MVT::i16, 3},
826 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
827 {Intrinsic::usub_with_overflow, MVT::i32, 1},
828 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
829 {Intrinsic::usub_with_overflow, MVT::i64, 1},
830 {Intrinsic::smul_with_overflow, MVT::i8, 5},
831 {Intrinsic::umul_with_overflow, MVT::i8, 4},
832 {Intrinsic::smul_with_overflow, MVT::i16, 5},
833 {Intrinsic::umul_with_overflow, MVT::i16, 4},
834 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
835 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
836 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
837 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
838 };
839 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
840 if (MTy.isSimple())
841 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
842 MTy.getSimpleVT()))
843 return Entry->Cost;
844 break;
845 }
846 case Intrinsic::fptosi_sat:
847 case Intrinsic::fptoui_sat: {
848 if (ICA.getArgTypes().empty())
849 break;
850 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
851 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
852 EVT MTy = TLI->getValueType(DL, RetTy);
853 // Check for the legal types, which are where the size of the input and the
854 // output are the same, or we are using cvt f64->i32 or f32->i64.
855 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
856 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
857 LT.second == MVT::v2f64)) {
858 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
859 (LT.second == MVT::f64 && MTy == MVT::i32) ||
860 (LT.second == MVT::f32 && MTy == MVT::i64)))
861 return LT.first;
862 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
863 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
864 MTy.getScalarSizeInBits() == 64)
865 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
866 }
867 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
868 // f32.
869 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
870 return LT.first + getIntrinsicInstrCost(
871 {ICA.getID(),
872 RetTy,
873 {ICA.getArgTypes()[0]->getWithNewType(
874 Type::getFloatTy(RetTy->getContext()))}},
875 CostKind);
876 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
877 (LT.second == MVT::f16 && MTy == MVT::i64) ||
878 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
879 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
880 return LT.first;
881 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
882 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
883 MTy.getScalarSizeInBits() == 32)
884 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
885 // Extending vector types v8f16->v8i32. These currently scalarize but the
886 // codegen could be better.
887 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
888 MTy.getScalarSizeInBits() == 64)
889 return MTy.getVectorNumElements() * 3;
890
891 // If we can we use a legal convert followed by a min+max
892 if ((LT.second.getScalarType() == MVT::f32 ||
893 LT.second.getScalarType() == MVT::f64 ||
894 LT.second.getScalarType() == MVT::f16) &&
895 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
896 Type *LegalTy =
897 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
898 if (LT.second.isVector())
899 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
900 InstructionCost Cost = 1;
901 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
902 LegalTy, {LegalTy, LegalTy});
903 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
904 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
905 LegalTy, {LegalTy, LegalTy});
906 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
907 return LT.first * Cost +
908 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
909 : 1);
910 }
911 // Otherwise we need to follow the default expansion that clamps the value
912 // using a float min/max with a fcmp+sel for nan handling when signed.
913 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
914 RetTy = RetTy->getScalarType();
915 if (LT.second.isVector()) {
916 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
917 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
918 }
919 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
920 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
921 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
922 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
923 Cost +=
924 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
925 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
926 if (IsSigned) {
927 Type *CondTy = RetTy->getWithNewBitWidth(1);
928 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
929 CmpInst::FCMP_UNO, CostKind);
930 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
931 CmpInst::FCMP_UNO, CostKind);
932 }
933 return LT.first * Cost;
934 }
935 case Intrinsic::fshl:
936 case Intrinsic::fshr: {
937 if (ICA.getArgs().empty())
938 break;
939
940 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
941
942 // ROTR / ROTL is a funnel shift with equal first and second operand. For
943 // ROTR on integer registers (i32/i64) this can be done in a single ror
944 // instruction. A fshl with a non-constant shift uses a neg + ror.
945 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
946 (RetTy->getPrimitiveSizeInBits() == 32 ||
947 RetTy->getPrimitiveSizeInBits() == 64)) {
948 InstructionCost NegCost =
949 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
950 return 1 + NegCost;
951 }
952
953 // TODO: Add handling for fshl where third argument is not a constant.
954 if (!OpInfoZ.isConstant())
955 break;
956
957 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
958 if (OpInfoZ.isUniform()) {
959 static const CostTblEntry FshlTbl[] = {
960 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
961 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
962 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
963 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
964 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
965 // to avoid having to duplicate the costs.
966 const auto *Entry =
967 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
968 if (Entry)
969 return LegalisationCost.first * Entry->Cost;
970 }
971
972 auto TyL = getTypeLegalizationCost(RetTy);
973 if (!RetTy->isIntegerTy())
974 break;
975
976 // Estimate cost manually, as types like i8 and i16 will get promoted to
977 // i32 and CostTableLookup will ignore the extra conversion cost.
978 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
979 RetTy->getScalarSizeInBits() < 64) ||
980 (RetTy->getScalarSizeInBits() % 64 != 0);
981 unsigned ExtraCost = HigherCost ? 1 : 0;
982 if (RetTy->getScalarSizeInBits() == 32 ||
983 RetTy->getScalarSizeInBits() == 64)
984 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
985 // extr instruction.
986 else if (HigherCost)
987 ExtraCost = 1;
988 else
989 break;
990 return TyL.first + ExtraCost;
991 }
992 case Intrinsic::get_active_lane_mask: {
993 auto RetTy = cast<VectorType>(ICA.getReturnType());
994 EVT RetVT = getTLI()->getValueType(DL, RetTy);
995 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
996 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
997 break;
998
999 if (RetTy->isScalableTy()) {
1000 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1001 TargetLowering::TypeSplitVector)
1002 break;
1003
1004 auto LT = getTypeLegalizationCost(RetTy);
1005 InstructionCost Cost = LT.first;
1006 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1007 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1008 // nxv32i1 = get_active_lane_mask(base, idx) ->
1009 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1010 if (ST->hasSVE2p1() || ST->hasSME2()) {
1011 Cost /= 2;
1012 if (Cost == 1)
1013 return Cost;
1014 }
1015
1016 // If more than one whilelo intrinsic is required, include the extra cost
1017 // required by the saturating add & select required to increment the
1018 // start value after the first intrinsic call.
1019 Type *OpTy = ICA.getArgTypes()[0];
1020 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1021 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1022 Type *CondTy = OpTy->getWithNewBitWidth(1);
1023 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1024 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1025 return Cost + (SplitCost * (Cost - 1));
1026 } else if (!getTLI()->isTypeLegal(RetVT)) {
1027 // We don't have enough context at this point to determine if the mask
1028 // is going to be kept live after the block, which will force the vXi1
1029 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1030 // For now, we just assume the vectorizer created this intrinsic and
1031 // the result will be the input for a PHI. In this case the cost will
1032 // be extremely high for fixed-width vectors.
1033 // NOTE: getScalarizationOverhead returns a cost that's far too
1034 // pessimistic for the actual generated codegen. In reality there are
1035 // two instructions generated per lane.
1036 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1037 }
1038 break;
1039 }
1040 case Intrinsic::experimental_vector_match: {
1041 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1042 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1043 unsigned SearchSize = NeedleTy->getNumElements();
1044 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1045 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1046 // Neoverse V3, these are cheap operations with the same latency as a
1047 // vector ADD. In most cases, however, we also need to do an extra DUP.
1048 // For fixed-length vectors we currently need an extra five--six
1049 // instructions besides the MATCH.
1050 InstructionCost Cost = 4;
1051 if (isa<FixedVectorType>(RetTy))
1052 Cost += 10;
1053 return Cost;
1054 }
1055 break;
1056 }
1057 case Intrinsic::experimental_cttz_elts: {
1058 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1059 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1060 // This will consist of a SVE brkb and a cntp instruction. These
1061 // typically have the same latency and half the throughput as a vector
1062 // add instruction.
1063 return 4;
1064 }
1065 break;
1066 }
1067 case Intrinsic::experimental_vector_extract_last_active:
1068 if (ST->isSVEorStreamingSVEAvailable()) {
1069 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1070 // This should turn into chained clastb instructions.
1071 return LegalCost;
1072 }
1073 break;
1074 default:
1075 break;
1076 }
1077 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1078}
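// Example of the tables above (illustrative): @llvm.umin.v4i32 legalizes to a
// single vector UMIN and is costed at 1, while @llvm.umin.v2i64 has no native
// NEON form and is costed at 2 (cmp + bif); scalable types such as nxv4i32
// are also costed at 1.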
1079
1080/// The function will remove redundant reinterprets casting in the presence
1081/// of the control flow
1082static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1083 IntrinsicInst &II) {
1084 SmallVector<Instruction *> Worklist;
1085 auto RequiredType = II.getType();
1086
1087 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1088 assert(PN && "Expected Phi Node!");
1089
1090 // Don't create a new Phi unless we can remove the old one.
1091 if (!PN->hasOneUse())
1092 return std::nullopt;
1093
1094 for (Value *IncValPhi : PN->incoming_values()) {
1095 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1096 if (!Reinterpret ||
1097 Reinterpret->getIntrinsicID() !=
1098 Intrinsic::aarch64_sve_convert_to_svbool ||
1099 RequiredType != Reinterpret->getArgOperand(0)->getType())
1100 return std::nullopt;
1101 }
1102
1103 // Create the new Phi
1104 IC.Builder.SetInsertPoint(PN);
1105 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1106 Worklist.push_back(PN);
1107
1108 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1109 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1110 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1111 Worklist.push_back(Reinterpret);
1112 }
1113
1114 // Cleanup Phi Node and reinterprets
1115 return IC.replaceInstUsesWith(II, NPN);
1116}
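// Illustrative shape handled above (a sketch): when a convert.from.svbool
// user sees a phi whose incoming values are all convert.to.svbool casts of
// the required predicate type, the phi is recreated directly on the uncast
// values and the old phi plus the reinterprets become dead and are cleaned
// up by the combiner.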
1117
1118// A collection of properties common to SVE intrinsics that allow for combines
1119// to be written without needing to know the specific intrinsic.
1120struct SVEIntrinsicInfo {
1121 //
1122 // Helper routines for common intrinsic definitions.
1123 //
1124
1125 // e.g. llvm.aarch64.sve.add pg, op1, op2
1126 // with IID ==> llvm.aarch64.sve.add_u
1127 static SVEIntrinsicInfo
1134
1135 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1142
1143 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1149
1150 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1156
1157 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1158 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1159 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1160 return SVEIntrinsicInfo()
1163 }
1164
1165 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1166 // llvm.aarch64.sve.ld1 pg, ptr
1173
1174 // All properties relate to predication and thus having a general predicate
1175 // is the minimum requirement to say there is intrinsic info to act on.
1176 explicit operator bool() const { return hasGoverningPredicate(); }
1177
1178 //
1179 // Properties relating to the governing predicate.
1180 //
1181
1182 bool hasGoverningPredicate() const {
1183 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1184 }
1185
1186 unsigned getGoverningPredicateOperandIdx() const {
1187 assert(hasGoverningPredicate() && "Property not set!");
1188 return GoverningPredicateIdx;
1189 }
1190
1191 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1192 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1193 GoverningPredicateIdx = Index;
1194 return *this;
1195 }
1196
1197 //
1198 // Properties relating to operations the intrinsic could be transformed into.
1199 // NOTE: This does not mean such a transformation is always possible, but the
1200 // knowledge makes it possible to reuse existing optimisations without needing
1201 // to embed specific handling for each intrinsic. For example, instruction
1202 // simplification can be used to optimise an intrinsic's active lanes.
1203 //
1204
1205 bool hasMatchingUndefIntrinsic() const {
1206 return UndefIntrinsic != Intrinsic::not_intrinsic;
1207 }
1208
1209 Intrinsic::ID getMatchingUndefIntrinsic() const {
1210 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1211 return UndefIntrinsic;
1212 }
1213
1214 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1215 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1216 UndefIntrinsic = IID;
1217 return *this;
1218 }
1219
1220 bool hasMatchingIROpode() const { return IROpcode != 0; }
1221
1222 unsigned getMatchingIROpode() const {
1223 assert(hasMatchingIROpode() && "Property not set!");
1224 return IROpcode;
1225 }
1226
1227 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1228 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1229 IROpcode = Opcode;
1230 return *this;
1231 }
1232
1233 //
1234 // Properties relating to the result of inactive lanes.
1235 //
1236
1237 bool inactiveLanesTakenFromOperand() const {
1238 return ResultLanes == InactiveLanesTakenFromOperand;
1239 }
1240
1241 unsigned getOperandIdxInactiveLanesTakenFrom() const {
1242 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1243 return OperandIdxForInactiveLanes;
1244 }
1245
1246 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1247 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1248 ResultLanes = InactiveLanesTakenFromOperand;
1249 OperandIdxForInactiveLanes = Index;
1250 return *this;
1251 }
1252
1253 bool inactiveLanesAreNotDefined() const {
1254 return ResultLanes == InactiveLanesAreNotDefined;
1255 }
1256
1257 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1258 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1259 ResultLanes = InactiveLanesAreNotDefined;
1260 return *this;
1261 }
1262
1263 bool inactiveLanesAreUnused() const {
1264 return ResultLanes == InactiveLanesAreUnused;
1265 }
1266
1267 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1268 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1269 ResultLanes = InactiveLanesAreUnused;
1270 return *this;
1271 }
1272
1273 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1274 // inactiveLanesAreZeroed =
1275 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1276 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1277
1278 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1279 ResultIsZeroInitialized = true;
1280 return *this;
1281 }
1282
1283 //
1284 // The first operand of unary merging operations is typically only used to
1285 // set the result for inactive lanes. Knowing this allows us to deadcode the
1286 // operand when we can prove there are no inactive lanes.
1287 //
1288
1289 bool hasOperandWithNoActiveLanes() const {
1290 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1291 }
1292
1293 unsigned getOperandIdxWithNoActiveLanes() const {
1294 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1295 return OperandIdxWithNoActiveLanes;
1296 }
1297
1298 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1299 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1300 OperandIdxWithNoActiveLanes = Index;
1301 return *this;
1302 }
1303
1304private:
1305 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1306
1307 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1308 unsigned IROpcode = 0;
1309
1310 enum PredicationStyle {
1312 InactiveLanesTakenFromOperand,
1313 InactiveLanesAreNotDefined,
1314 InactiveLanesAreUnused
1315 } ResultLanes = Uninitialized;
1316
1317 bool ResultIsZeroInitialized = false;
1318 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1319 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1320};
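// Usage sketch (illustrative): a predicated merging intrinsic such as
//   llvm.aarch64.sve.add(pg, op1, op2)
// is described with the governing predicate at operand 0, inactive lanes
// taken from operand 1, llvm.aarch64.sve.add_u as the matching undef-variant
// and Instruction::Add as the matching IR opcode. Generic combines can then
// simplify the active lanes, or switch to the undef form when the predicate
// is known to be all-active, without any intrinsic-specific handling.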
1321
1322static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1323 // Some SVE intrinsics do not use scalable vector types, but since they are
1324 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1325 if (!isa<ScalableVectorType>(II.getType()) &&
1326 all_of(II.args(), [&](const Value *V) {
1327 return !isa<ScalableVectorType>(V->getType());
1328 }))
1329 return SVEIntrinsicInfo();
1330
1331 Intrinsic::ID IID = II.getIntrinsicID();
1332 switch (IID) {
1333 default:
1334 break;
1335 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1336 case Intrinsic::aarch64_sve_fcvt_f16f32:
1337 case Intrinsic::aarch64_sve_fcvt_f16f64:
1338 case Intrinsic::aarch64_sve_fcvt_f32f16:
1339 case Intrinsic::aarch64_sve_fcvt_f32f64:
1340 case Intrinsic::aarch64_sve_fcvt_f64f16:
1341 case Intrinsic::aarch64_sve_fcvt_f64f32:
1342 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1343 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1344 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1345 case Intrinsic::aarch64_sve_fcvtzs:
1346 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1347 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1348 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1349 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1350 case Intrinsic::aarch64_sve_fcvtzu:
1351 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1352 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1353 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1354 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1355 case Intrinsic::aarch64_sve_scvtf:
1356 case Intrinsic::aarch64_sve_scvtf_f16i32:
1357 case Intrinsic::aarch64_sve_scvtf_f16i64:
1358 case Intrinsic::aarch64_sve_scvtf_f32i64:
1359 case Intrinsic::aarch64_sve_scvtf_f64i32:
1360 case Intrinsic::aarch64_sve_ucvtf:
1361 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1362 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1363 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1364 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1365 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1366
1367 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1368 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1369 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1370 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1372
1373 case Intrinsic::aarch64_sve_fabd:
1374 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1375 case Intrinsic::aarch64_sve_fadd:
1376 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1377 .setMatchingIROpcode(Instruction::FAdd);
1378 case Intrinsic::aarch64_sve_fdiv:
1379 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1380 .setMatchingIROpcode(Instruction::FDiv);
1381 case Intrinsic::aarch64_sve_fmax:
1382 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1383 case Intrinsic::aarch64_sve_fmaxnm:
1384 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1385 case Intrinsic::aarch64_sve_fmin:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1387 case Intrinsic::aarch64_sve_fminnm:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1389 case Intrinsic::aarch64_sve_fmla:
1390 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1391 case Intrinsic::aarch64_sve_fmls:
1392 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1393 case Intrinsic::aarch64_sve_fmul:
1394 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1395 .setMatchingIROpcode(Instruction::FMul);
1396 case Intrinsic::aarch64_sve_fmulx:
1397 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1398 case Intrinsic::aarch64_sve_fnmla:
1399 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1400 case Intrinsic::aarch64_sve_fnmls:
1401 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1402 case Intrinsic::aarch64_sve_fsub:
1403 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1404 .setMatchingIROpcode(Instruction::FSub);
1405 case Intrinsic::aarch64_sve_add:
1406 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1407 .setMatchingIROpcode(Instruction::Add);
1408 case Intrinsic::aarch64_sve_mla:
1409 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1410 case Intrinsic::aarch64_sve_mls:
1411 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1412 case Intrinsic::aarch64_sve_mul:
1413 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1414 .setMatchingIROpcode(Instruction::Mul);
1415 case Intrinsic::aarch64_sve_sabd:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1417 case Intrinsic::aarch64_sve_sdiv:
1418 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1419 .setMatchingIROpcode(Instruction::SDiv);
1420 case Intrinsic::aarch64_sve_smax:
1421 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1422 case Intrinsic::aarch64_sve_smin:
1423 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1424 case Intrinsic::aarch64_sve_smulh:
1425 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1426 case Intrinsic::aarch64_sve_sub:
1427 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1428 .setMatchingIROpcode(Instruction::Sub);
1429 case Intrinsic::aarch64_sve_uabd:
1430 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1431 case Intrinsic::aarch64_sve_udiv:
1432 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1433 .setMatchingIROpcode(Instruction::UDiv);
1434 case Intrinsic::aarch64_sve_umax:
1435 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1436 case Intrinsic::aarch64_sve_umin:
1437 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1438 case Intrinsic::aarch64_sve_umulh:
1439 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1440 case Intrinsic::aarch64_sve_asr:
1441 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1442 .setMatchingIROpcode(Instruction::AShr);
1443 case Intrinsic::aarch64_sve_lsl:
1444 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1445 .setMatchingIROpcode(Instruction::Shl);
1446 case Intrinsic::aarch64_sve_lsr:
1447 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1448 .setMatchingIROpcode(Instruction::LShr);
1449 case Intrinsic::aarch64_sve_and:
1450 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1451 .setMatchingIROpcode(Instruction::And);
1452 case Intrinsic::aarch64_sve_bic:
1453 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1454 case Intrinsic::aarch64_sve_eor:
1455 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1456 .setMatchingIROpcode(Instruction::Xor);
1457 case Intrinsic::aarch64_sve_orr:
1458 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1459 .setMatchingIROpcode(Instruction::Or);
1460 case Intrinsic::aarch64_sve_sqrshl:
1461 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1462 case Intrinsic::aarch64_sve_sqshl:
1463 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1464 case Intrinsic::aarch64_sve_sqsub:
1465 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1466 case Intrinsic::aarch64_sve_srshl:
1467 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1468 case Intrinsic::aarch64_sve_uqrshl:
1469 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1470 case Intrinsic::aarch64_sve_uqshl:
1471 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1472 case Intrinsic::aarch64_sve_uqsub:
1473 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1474 case Intrinsic::aarch64_sve_urshl:
1475 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1476
1477 case Intrinsic::aarch64_sve_add_u:
1478 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1479 Instruction::Add);
1480 case Intrinsic::aarch64_sve_and_u:
1481 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1482 Instruction::And);
1483 case Intrinsic::aarch64_sve_asr_u:
1484 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1485 Instruction::AShr);
1486 case Intrinsic::aarch64_sve_eor_u:
1487 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1488 Instruction::Xor);
1489 case Intrinsic::aarch64_sve_fadd_u:
1490 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1491 Instruction::FAdd);
1492 case Intrinsic::aarch64_sve_fdiv_u:
1493 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1494 Instruction::FDiv);
1495 case Intrinsic::aarch64_sve_fmul_u:
1496 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1497 Instruction::FMul);
1498 case Intrinsic::aarch64_sve_fsub_u:
1499 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1500 Instruction::FSub);
1501 case Intrinsic::aarch64_sve_lsl_u:
1502 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1503 Instruction::Shl);
1504 case Intrinsic::aarch64_sve_lsr_u:
1505 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1506 Instruction::LShr);
1507 case Intrinsic::aarch64_sve_mul_u:
1508 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1509 Instruction::Mul);
1510 case Intrinsic::aarch64_sve_orr_u:
1511 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1512 Instruction::Or);
1513 case Intrinsic::aarch64_sve_sdiv_u:
1514 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1515 Instruction::SDiv);
1516 case Intrinsic::aarch64_sve_sub_u:
1517 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1518 Instruction::Sub);
1519 case Intrinsic::aarch64_sve_udiv_u:
1520 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1521 Instruction::UDiv);
1522
1523 case Intrinsic::aarch64_sve_addqv:
1524 case Intrinsic::aarch64_sve_and_z:
1525 case Intrinsic::aarch64_sve_bic_z:
1526 case Intrinsic::aarch64_sve_brka_z:
1527 case Intrinsic::aarch64_sve_brkb_z:
1528 case Intrinsic::aarch64_sve_brkn_z:
1529 case Intrinsic::aarch64_sve_brkpa_z:
1530 case Intrinsic::aarch64_sve_brkpb_z:
1531 case Intrinsic::aarch64_sve_cntp:
1532 case Intrinsic::aarch64_sve_compact:
1533 case Intrinsic::aarch64_sve_eor_z:
1534 case Intrinsic::aarch64_sve_eorv:
1535 case Intrinsic::aarch64_sve_eorqv:
1536 case Intrinsic::aarch64_sve_nand_z:
1537 case Intrinsic::aarch64_sve_nor_z:
1538 case Intrinsic::aarch64_sve_orn_z:
1539 case Intrinsic::aarch64_sve_orr_z:
1540 case Intrinsic::aarch64_sve_orv:
1541 case Intrinsic::aarch64_sve_orqv:
1542 case Intrinsic::aarch64_sve_pnext:
1543 case Intrinsic::aarch64_sve_rdffr_z:
1544 case Intrinsic::aarch64_sve_saddv:
1545 case Intrinsic::aarch64_sve_uaddv:
1546 case Intrinsic::aarch64_sve_umaxv:
1547 case Intrinsic::aarch64_sve_umaxqv:
1548 case Intrinsic::aarch64_sve_cmpeq:
1549 case Intrinsic::aarch64_sve_cmpeq_wide:
1550 case Intrinsic::aarch64_sve_cmpge:
1551 case Intrinsic::aarch64_sve_cmpge_wide:
1552 case Intrinsic::aarch64_sve_cmpgt:
1553 case Intrinsic::aarch64_sve_cmpgt_wide:
1554 case Intrinsic::aarch64_sve_cmphi:
1555 case Intrinsic::aarch64_sve_cmphi_wide:
1556 case Intrinsic::aarch64_sve_cmphs:
1557 case Intrinsic::aarch64_sve_cmphs_wide:
1558 case Intrinsic::aarch64_sve_cmple_wide:
1559 case Intrinsic::aarch64_sve_cmplo_wide:
1560 case Intrinsic::aarch64_sve_cmpls_wide:
1561 case Intrinsic::aarch64_sve_cmplt_wide:
1562 case Intrinsic::aarch64_sve_cmpne:
1563 case Intrinsic::aarch64_sve_cmpne_wide:
1564 case Intrinsic::aarch64_sve_facge:
1565 case Intrinsic::aarch64_sve_facgt:
1566 case Intrinsic::aarch64_sve_fcmpeq:
1567 case Intrinsic::aarch64_sve_fcmpge:
1568 case Intrinsic::aarch64_sve_fcmpgt:
1569 case Intrinsic::aarch64_sve_fcmpne:
1570 case Intrinsic::aarch64_sve_fcmpuo:
1571 case Intrinsic::aarch64_sve_ld1:
1572 case Intrinsic::aarch64_sve_ld1_gather:
1573 case Intrinsic::aarch64_sve_ld1_gather_index:
1574 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1575 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1576 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1577 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1578 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1579 case Intrinsic::aarch64_sve_ld1q_gather_index:
1580 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1581 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1582 case Intrinsic::aarch64_sve_ld1ro:
1583 case Intrinsic::aarch64_sve_ld1rq:
1584 case Intrinsic::aarch64_sve_ld1udq:
1585 case Intrinsic::aarch64_sve_ld1uwq:
1586 case Intrinsic::aarch64_sve_ld2_sret:
1587 case Intrinsic::aarch64_sve_ld2q_sret:
1588 case Intrinsic::aarch64_sve_ld3_sret:
1589 case Intrinsic::aarch64_sve_ld3q_sret:
1590 case Intrinsic::aarch64_sve_ld4_sret:
1591 case Intrinsic::aarch64_sve_ld4q_sret:
1592 case Intrinsic::aarch64_sve_ldff1:
1593 case Intrinsic::aarch64_sve_ldff1_gather:
1594 case Intrinsic::aarch64_sve_ldff1_gather_index:
1595 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1596 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1597 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1598 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1599 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1600 case Intrinsic::aarch64_sve_ldnf1:
1601 case Intrinsic::aarch64_sve_ldnt1:
1602 case Intrinsic::aarch64_sve_ldnt1_gather:
1603 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1604 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1605 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1607
1608 case Intrinsic::aarch64_sve_prf:
1609 case Intrinsic::aarch64_sve_prfb_gather_index:
1610 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1611 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1612 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1613 case Intrinsic::aarch64_sve_prfd_gather_index:
1614 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1615 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1616 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1617 case Intrinsic::aarch64_sve_prfh_gather_index:
1618 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1619 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1620 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1621 case Intrinsic::aarch64_sve_prfw_gather_index:
1622 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1623 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1624 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1626
1627 case Intrinsic::aarch64_sve_st1_scatter:
1628 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1629 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1630 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1631 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1632 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1633 case Intrinsic::aarch64_sve_st1dq:
1634 case Intrinsic::aarch64_sve_st1q_scatter_index:
1635 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1636 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1637 case Intrinsic::aarch64_sve_st1wq:
1638 case Intrinsic::aarch64_sve_stnt1:
1639 case Intrinsic::aarch64_sve_stnt1_scatter:
1640 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1641 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1642 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1644 case Intrinsic::aarch64_sve_st2:
1645 case Intrinsic::aarch64_sve_st2q:
1647 case Intrinsic::aarch64_sve_st3:
1648 case Intrinsic::aarch64_sve_st3q:
1650 case Intrinsic::aarch64_sve_st4:
1651 case Intrinsic::aarch64_sve_st4q:
1653 }
1654
1655 return SVEIntrinsicInfo();
1656}
1657
1658static bool isAllActivePredicate(Value *Pred) {
1659 Value *UncastedPred;
1660
1661 // Look through predicate casts that only remove lanes.
1662 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1663 m_Value(UncastedPred)))) {
1664 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1665 Pred = UncastedPred;
1666
1667 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1668 m_Value(UncastedPred))))
1669 // If the predicate has no more lanes than the uncasted predicate then we
1670 // know the casting has no effect.
1671 if (OrigPredTy->getMinNumElements() <=
1672 cast<ScalableVectorType>(UncastedPred->getType())
1673 ->getMinNumElements())
1674 Pred = UncastedPred;
1675 }
1676
1677 auto *C = dyn_cast<Constant>(Pred);
1678 return C && C->isAllOnesValue();
1679}
1680
1681// Simplify `V` by only considering the operations that affect active lanes.
1682// This function should only return existing Values or newly created Constants.
1683static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1684 auto *Dup = dyn_cast<IntrinsicInst>(V);
1685 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1686 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1687 return ConstantVector::getSplat(
1688 cast<VectorType>(V->getType())->getElementCount(),
1689 cast<Constant>(Dup->getOperand(2)));
1690
1691 return V;
1692}
1693
1694static std::optional<Instruction *>
1695 simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1696 const SVEIntrinsicInfo &IInfo) {
1697 const unsigned Opc = IInfo.getMatchingIROpode();
1698 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1699
1700 Value *Pg = II.getOperand(0);
1701 Value *Op1 = II.getOperand(1);
1702 Value *Op2 = II.getOperand(2);
1703 const DataLayout &DL = II.getDataLayout();
1704
1705 // Canonicalise constants to the RHS.
1706 if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
1707 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1708 IC.replaceOperand(II, 1, Op2);
1709 IC.replaceOperand(II, 2, Op1);
1710 return &II;
1711 }
1712
1713 // Only active lanes matter when simplifying the operation.
1714 Op1 = stripInactiveLanes(Op1, Pg);
1715 Op2 = stripInactiveLanes(Op2, Pg);
1716
1717 Value *SimpleII;
1718 if (auto FII = dyn_cast<FPMathOperator>(&II))
1719 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1720 else
1721 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1722
1723 // An SVE intrinsic's result is always defined. However, this is not the case
1724 // for its equivalent IR instruction (e.g. when shifting by an amount more
1725 // than the data's bitwidth). Simplifications to an undefined result must be
1726 // ignored to preserve the intrinsic's expected behaviour.
1727 if (!SimpleII || isa<UndefValue>(SimpleII))
1728 return std::nullopt;
1729
1730 if (IInfo.inactiveLanesAreNotDefined())
1731 return IC.replaceInstUsesWith(II, SimpleII);
1732
1733 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1734
1735 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1736 if (SimpleII == Inactive)
1737 return IC.replaceInstUsesWith(II, SimpleII);
1738
1739 // Inactive lanes must be preserved.
1740 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1741 return IC.replaceInstUsesWith(II, SimpleII);
1742}
1743
1744// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1745// to operations with less strict inactive lane requirements.
1746static std::optional<Instruction *>
1747 simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1748 const SVEIntrinsicInfo &IInfo) {
1749 if (!IInfo.hasGoverningPredicate())
1750 return std::nullopt;
1751
1752 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1753
1754 // If there are no active lanes.
1755 if (match(OpPredicate, m_ZeroInt())) {
1756 if (IInfo.inactiveLanesTakenFromOperand())
1757 return IC.replaceInstUsesWith(
1758 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1759
1760 if (IInfo.inactiveLanesAreUnused()) {
1761 if (IInfo.resultIsZeroInitialized())
1762 IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1763
1764 return IC.eraseInstFromFunction(II);
1765 }
1766 }
1767
1768 // If there are no inactive lanes.
1769 if (isAllActivePredicate(OpPredicate)) {
1770 if (IInfo.hasOperandWithNoActiveLanes()) {
1771 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1772 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1773 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1774 }
1775
1776 if (IInfo.hasMatchingUndefIntrinsic()) {
1777 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1778 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1779 II.setCalledFunction(NewDecl);
1780 return &II;
1781 }
1782 }
1783
1784 // Operation specific simplifications.
1785 if (IInfo.hasMatchingIROpode() &&
1786 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1787 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1788
1789 return std::nullopt;
1790}
1791
1792// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1793// => (binop (pred) (from_svbool _) (from_svbool _))
1794//
1795// The above transformation eliminates a `to_svbool` in the predicate
1796// operand of bitwise operation `binop` by narrowing the vector width of
1797// the operation. For example, it would convert a `<vscale x 16 x i1>
1798// and` into a `<vscale x 4 x i1> and`. This is profitable because
1799// to_svbool must zero the new lanes during widening, whereas
1800// from_svbool is free.
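// For example (illustrative types):
//   %p = to_svbool(<vscale x 4 x i1> %pred)
//   %b = and_z(%p, %x, %y)              ; <vscale x 16 x i1>
//   %r = from_svbool(%b)                ; <vscale x 4 x i1>
// becomes
//   %r = and_z(%pred, from_svbool(%x), from_svbool(%y))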
1801static std::optional<Instruction *>
1802 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1803 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1804 if (!BinOp)
1805 return std::nullopt;
1806
1807 auto IntrinsicID = BinOp->getIntrinsicID();
1808 switch (IntrinsicID) {
1809 case Intrinsic::aarch64_sve_and_z:
1810 case Intrinsic::aarch64_sve_bic_z:
1811 case Intrinsic::aarch64_sve_eor_z:
1812 case Intrinsic::aarch64_sve_nand_z:
1813 case Intrinsic::aarch64_sve_nor_z:
1814 case Intrinsic::aarch64_sve_orn_z:
1815 case Intrinsic::aarch64_sve_orr_z:
1816 break;
1817 default:
1818 return std::nullopt;
1819 }
1820
1821 auto BinOpPred = BinOp->getOperand(0);
1822 auto BinOpOp1 = BinOp->getOperand(1);
1823 auto BinOpOp2 = BinOp->getOperand(2);
1824
1825 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1826 if (!PredIntr ||
1827 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1828 return std::nullopt;
1829
1830 auto PredOp = PredIntr->getOperand(0);
1831 auto PredOpTy = cast<VectorType>(PredOp->getType());
1832 if (PredOpTy != II.getType())
1833 return std::nullopt;
1834
1835 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1836 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1837 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1838 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1839 if (BinOpOp1 == BinOpOp2)
1840 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1841 else
1842 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1843 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1844
1845 auto NarrowedBinOp =
1846 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1847 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1848}
1849
1850static std::optional<Instruction *>
1851 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1852 // If the reinterpret instruction operand is a PHI Node
1853 if (isa<PHINode>(II.getArgOperand(0)))
1854 return processPhiNode(IC, II);
1855
1856 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1857 return BinOpCombine;
1858
1859 // Ignore converts to/from svcount_t.
1860 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1861 isa<TargetExtType>(II.getType()))
1862 return std::nullopt;
1863
1864 SmallVector<Instruction *, 32> CandidatesForRemoval;
1865 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1866
1867 const auto *IVTy = cast<VectorType>(II.getType());
1868
1869 // Walk the chain of conversions.
1870 while (Cursor) {
1871 // If the type of the cursor has fewer lanes than the final result, zeroing
1872 // must take place, which breaks the equivalence chain.
1873 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1874 if (CursorVTy->getElementCount().getKnownMinValue() <
1875 IVTy->getElementCount().getKnownMinValue())
1876 break;
1877
1878 // If the cursor has the same type as I, it is a viable replacement.
1879 if (Cursor->getType() == IVTy)
1880 EarliestReplacement = Cursor;
1881
1882 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1883
1884 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1885 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1886 Intrinsic::aarch64_sve_convert_to_svbool ||
1887 IntrinsicCursor->getIntrinsicID() ==
1888 Intrinsic::aarch64_sve_convert_from_svbool))
1889 break;
1890
1891 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1892 Cursor = IntrinsicCursor->getOperand(0);
1893 }
1894
1895 // If no viable replacement in the conversion chain was found, there is
1896 // nothing to do.
1897 if (!EarliestReplacement)
1898 return std::nullopt;
1899
1900 return IC.replaceInstUsesWith(II, EarliestReplacement);
1901}
1902
1903static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1904 IntrinsicInst &II) {
1905 // svsel(ptrue, x, y) => x
1906 auto *OpPredicate = II.getOperand(0);
1907 if (isAllActivePredicate(OpPredicate))
1908 return IC.replaceInstUsesWith(II, II.getOperand(1));
1909
1910 auto Select =
1911 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1912 return IC.replaceInstUsesWith(II, Select);
1913}
1914
1915static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1916 IntrinsicInst &II) {
1917 Value *Pg = II.getOperand(1);
1918
1919 // sve.dup(V, all_active, X) ==> splat(X)
1920 if (isAllActivePredicate(Pg)) {
1921 auto *RetTy = cast<ScalableVectorType>(II.getType());
1922 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1923 II.getArgOperand(2));
1924 return IC.replaceInstUsesWith(II, Splat);
1925 }
1926
1927 if (!match(Pg, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1928 m_SpecificInt(AArch64SVEPredPattern::vl1))))
1929 return std::nullopt;
1930
1931 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
1932 Value *Insert = IC.Builder.CreateInsertElement(
1933 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
1934 return IC.replaceInstUsesWith(II, Insert);
1935}
1936
1937static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1938 IntrinsicInst &II) {
1939 // Replace DupX with a regular IR splat.
1940 auto *RetTy = cast<ScalableVectorType>(II.getType());
1941 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1942 II.getArgOperand(0));
1943 Splat->takeName(&II);
1944 return IC.replaceInstUsesWith(II, Splat);
1945}
1946
1947static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1948 IntrinsicInst &II) {
1949 LLVMContext &Ctx = II.getContext();
1950
1951 if (!isAllActivePredicate(II.getArgOperand(0)))
1952 return std::nullopt;
1953
1954 // Check that we have a compare of zero..
1955 auto *SplatValue =
1956 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1957 if (!SplatValue || !SplatValue->isZero())
1958 return std::nullopt;
1959
1960 // ..against a dupq
1961 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1962 if (!DupQLane ||
1963 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1964 return std::nullopt;
1965
1966 // Where the dupq is a lane 0 replicate of a vector insert
1967 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1968 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1969 return std::nullopt;
1970
1971 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1972 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1973 return std::nullopt;
1974
1975 // Where the vector insert is a fixed constant vector insert into undef at
1976 // index zero
1977 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1978 return std::nullopt;
1979
1980 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1981 return std::nullopt;
1982
1983 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1984 if (!ConstVec)
1985 return std::nullopt;
1986
1987 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1988 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1989 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1990 return std::nullopt;
1991
1992 unsigned NumElts = VecTy->getNumElements();
1993 unsigned PredicateBits = 0;
1994
1995 // Expand intrinsic operands to a 16-bit byte level predicate
1996 for (unsigned I = 0; I < NumElts; ++I) {
1997 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1998 if (!Arg)
1999 return std::nullopt;
2000 if (!Arg->isZero())
2001 PredicateBits |= 1 << (I * (16 / NumElts));
2002 }
2003
2004 // If all bits are zero bail early with an empty predicate
2005 if (PredicateBits == 0) {
2006 auto *PFalse = Constant::getNullValue(II.getType());
2007 PFalse->takeName(&II);
2008 return IC.replaceInstUsesWith(II, PFalse);
2009 }
2010
2011 // Calculate largest predicate type used (where byte predicate is largest)
2012 unsigned Mask = 8;
2013 for (unsigned I = 0; I < 16; ++I)
2014 if ((PredicateBits & (1 << I)) != 0)
2015 Mask |= (I % 8);
2016
2017 unsigned PredSize = Mask & -Mask;
2018 auto *PredType = ScalableVectorType::get(
2019 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2020
2021 // Ensure all relevant bits are set
2022 for (unsigned I = 0; I < 16; I += PredSize)
2023 if ((PredicateBits & (1 << I)) == 0)
2024 return std::nullopt;
2025
2026 auto *PTruePat =
2027 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2028 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2029 {PredType}, {PTruePat});
2030 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2031 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2032 auto *ConvertFromSVBool =
2033 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2034 {II.getType()}, {ConvertToSVBool});
2035
2036 ConvertFromSVBool->takeName(&II);
2037 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2038}
2039
2040static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2041 IntrinsicInst &II) {
2042 Value *Pg = II.getArgOperand(0);
2043 Value *Vec = II.getArgOperand(1);
2044 auto IntrinsicID = II.getIntrinsicID();
2045 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2046
2047 // lastX(splat(X)) --> X
2048 if (auto *SplatVal = getSplatValue(Vec))
2049 return IC.replaceInstUsesWith(II, SplatVal);
2050
2051 // If x and/or y is a splat value then:
2052 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2053 Value *LHS, *RHS;
2054 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2055 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2056 auto *OldBinOp = cast<BinaryOperator>(Vec);
2057 auto OpC = OldBinOp->getOpcode();
2058 auto *NewLHS =
2059 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2060 auto *NewRHS =
2061 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2062 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2063 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2064 return IC.replaceInstUsesWith(II, NewBinOp);
2065 }
2066 }
2067
2068 auto *C = dyn_cast<Constant>(Pg);
2069 if (IsAfter && C && C->isNullValue()) {
2070 // The intrinsic is extracting lane 0 so use an extract instead.
2071 auto *IdxTy = Type::getInt64Ty(II.getContext());
2072 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2073 Extract->insertBefore(II.getIterator());
2074 Extract->takeName(&II);
2075 return IC.replaceInstUsesWith(II, Extract);
2076 }
2077
2078 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2079 if (!IntrPG)
2080 return std::nullopt;
2081
2082 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2083 return std::nullopt;
2084
2085 const auto PTruePattern =
2086 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2087
2088 // Can the intrinsic's predicate be converted to a known constant index?
2089 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2090 if (!MinNumElts)
2091 return std::nullopt;
2092
2093 unsigned Idx = MinNumElts - 1;
2094 // Increment the index if extracting the element after the last active
2095 // predicate element.
2096 if (IsAfter)
2097 ++Idx;
2098
2099 // Ignore extracts whose index is larger than the known minimum vector
2100 // length. NOTE: This is an artificial constraint where we prefer to
2101 // maintain what the user asked for until an alternative is proven faster.
2102 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2103 if (Idx >= PgVTy->getMinNumElements())
2104 return std::nullopt;
2105
2106 // The intrinsic is extracting a fixed lane so use an extract instead.
2107 auto *IdxTy = Type::getInt64Ty(II.getContext());
2108 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2109 Extract->insertBefore(II.getIterator());
2110 Extract->takeName(&II);
2111 return IC.replaceInstUsesWith(II, Extract);
2112}
2113
2114static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2115 IntrinsicInst &II) {
2116 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2117 // integer variant across a variety of micro-architectures. Replace scalar
2118 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2119 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2120 // depending on the micro-architecture, but has been observed as generally
2121 // being faster, particularly when the CLAST[AB] op is a loop-carried
2122 // dependency.
2123 Value *Pg = II.getArgOperand(0);
2124 Value *Fallback = II.getArgOperand(1);
2125 Value *Vec = II.getArgOperand(2);
2126 Type *Ty = II.getType();
2127
2128 if (!Ty->isIntegerTy())
2129 return std::nullopt;
2130
2131 Type *FPTy;
2132 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2133 default:
2134 return std::nullopt;
2135 case 16:
2136 FPTy = IC.Builder.getHalfTy();
2137 break;
2138 case 32:
2139 FPTy = IC.Builder.getFloatTy();
2140 break;
2141 case 64:
2142 FPTy = IC.Builder.getDoubleTy();
2143 break;
2144 }
2145
2146 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2147 auto *FPVTy = VectorType::get(
2148 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2149 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2150 auto *FPII = IC.Builder.CreateIntrinsic(
2151 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2152 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2153 return IC.replaceInstUsesWith(II, FPIItoInt);
2154}
2155
2156static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2157 IntrinsicInst &II) {
2158 LLVMContext &Ctx = II.getContext();
2159 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2160 // can work with RDFFR_PP for ptest elimination.
2161 auto *AllPat =
2162 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2163 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2164 {II.getType()}, {AllPat});
2165 auto *RDFFR =
2166 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2167 RDFFR->takeName(&II);
2168 return IC.replaceInstUsesWith(II, RDFFR);
2169}
2170
2171static std::optional<Instruction *>
2172 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2173 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2174
2175 if (Pattern == AArch64SVEPredPattern::all) {
2176 Value *Cnt = IC.Builder.CreateElementCount(
2177 II.getType(), ElementCount::getScalable(NumElts));
2178 Cnt->takeName(&II);
2179 return IC.replaceInstUsesWith(II, Cnt);
2180 }
2181
2182 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2183
2184 return MinNumElts && NumElts >= MinNumElts
2185 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2186 II, ConstantInt::get(II.getType(), MinNumElts)))
2187 : std::nullopt;
2188}
2189
2190static std::optional<Instruction *>
2191 instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2192 const AArch64Subtarget *ST) {
2193 if (!ST->isStreaming())
2194 return std::nullopt;
2195
2196 // In streaming mode, aarch64_sme_cntsd is equivalent to aarch64_sve_cntd
2197 // with SVEPredPattern::all.
2198 Value *Cnt =
2199 IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2));
2200 Cnt->takeName(&II);
2201 return IC.replaceInstUsesWith(II, Cnt);
2202}
2203
2204static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2205 IntrinsicInst &II) {
2206 Value *PgVal = II.getArgOperand(0);
2207 Value *OpVal = II.getArgOperand(1);
2208
2209 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2210 // Later optimizations prefer this form.
2211 if (PgVal == OpVal &&
2212 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2213 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2214 Value *Ops[] = {PgVal, OpVal};
2215 Type *Tys[] = {PgVal->getType()};
2216
2217 auto *PTest =
2218 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2219 PTest->takeName(&II);
2220
2221 return IC.replaceInstUsesWith(II, PTest);
2222 }
2223
2224 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2225 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2226
2227 if (!Pg || !Op)
2228 return std::nullopt;
2229
2230 Intrinsic::ID OpIID = Op->getIntrinsicID();
2231
2232 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2233 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2234 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2235 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2236 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2237
2238 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2239
2240 PTest->takeName(&II);
2241 return IC.replaceInstUsesWith(II, PTest);
2242 }
2243
2244 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2245 // Later optimizations may rewrite sequence to use the flag-setting variant
2246 // of instruction X to remove PTEST.
2247 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2248 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2249 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2250 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2251 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2252 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2253 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2254 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2255 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2256 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2257 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2258 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2259 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2260 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2261 Type *Tys[] = {Pg->getType()};
2262
2263 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2264 PTest->takeName(&II);
2265
2266 return IC.replaceInstUsesWith(II, PTest);
2267 }
2268
2269 return std::nullopt;
2270}
2271
2272template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2273static std::optional<Instruction *>
2274 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2275 bool MergeIntoAddendOp) {
2276 Value *P = II.getOperand(0);
2277 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2278 if (MergeIntoAddendOp) {
2279 AddendOp = II.getOperand(1);
2280 Mul = II.getOperand(2);
2281 } else {
2282 AddendOp = II.getOperand(2);
2283 Mul = II.getOperand(1);
2284 }
2285
2286 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2287 m_Value(MulOp1))))
2288 return std::nullopt;
2289
2290 if (!Mul->hasOneUse())
2291 return std::nullopt;
2292
2293 Instruction *FMFSource = nullptr;
2294 if (II.getType()->isFPOrFPVectorTy()) {
2295 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2296 // Stop the combine when the flags on the inputs differ in case dropping
2297 // flags would lead to us missing out on more beneficial optimizations.
2298 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2299 return std::nullopt;
2300 if (!FAddFlags.allowContract())
2301 return std::nullopt;
2302 FMFSource = &II;
2303 }
2304
2305 CallInst *Res;
2306 if (MergeIntoAddendOp)
2307 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2308 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2309 else
2310 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2311 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2312
2313 return IC.replaceInstUsesWith(II, Res);
2314}
2315
2316static std::optional<Instruction *>
2317 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2318 Value *Pred = II.getOperand(0);
2319 Value *PtrOp = II.getOperand(1);
2320 Type *VecTy = II.getType();
2321
2322 if (isAllActivePredicate(Pred)) {
2323 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2324 Load->copyMetadata(II);
2325 return IC.replaceInstUsesWith(II, Load);
2326 }
2327
2328 CallInst *MaskedLoad =
2329 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2330 Pred, ConstantAggregateZero::get(VecTy));
2331 MaskedLoad->copyMetadata(II);
2332 return IC.replaceInstUsesWith(II, MaskedLoad);
2333}
2334
2335static std::optional<Instruction *>
2336 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2337 Value *VecOp = II.getOperand(0);
2338 Value *Pred = II.getOperand(1);
2339 Value *PtrOp = II.getOperand(2);
2340
2341 if (isAllActivePredicate(Pred)) {
2342 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2343 Store->copyMetadata(II);
2344 return IC.eraseInstFromFunction(II);
2345 }
2346
2347 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2348 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2349 MaskedStore->copyMetadata(II);
2350 return IC.eraseInstFromFunction(II);
2351}
2352
2353 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2354 switch (Intrinsic) {
2355 case Intrinsic::aarch64_sve_fmul_u:
2356 return Instruction::BinaryOps::FMul;
2357 case Intrinsic::aarch64_sve_fadd_u:
2358 return Instruction::BinaryOps::FAdd;
2359 case Intrinsic::aarch64_sve_fsub_u:
2360 return Instruction::BinaryOps::FSub;
2361 default:
2362 return Instruction::BinaryOpsEnd;
2363 }
2364}
2365
2366static std::optional<Instruction *>
2367 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2368 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2369 if (II.isStrictFP())
2370 return std::nullopt;
2371
2372 auto *OpPredicate = II.getOperand(0);
2373 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2374 if (BinOpCode == Instruction::BinaryOpsEnd ||
2375 !isAllActivePredicate(OpPredicate))
2376 return std::nullopt;
2377 auto BinOp = IC.Builder.CreateBinOpFMF(
2378 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2379 return IC.replaceInstUsesWith(II, BinOp);
2380}
2381
2382static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2383 IntrinsicInst &II) {
2384 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2385 Intrinsic::aarch64_sve_mla>(
2386 IC, II, true))
2387 return MLA;
2388 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2389 Intrinsic::aarch64_sve_mad>(
2390 IC, II, false))
2391 return MAD;
2392 return std::nullopt;
2393}
2394
2395static std::optional<Instruction *>
2396 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2397 if (auto FMLA =
2398 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2399 Intrinsic::aarch64_sve_fmla>(IC, II,
2400 true))
2401 return FMLA;
2402 if (auto FMAD =
2403 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2404 Intrinsic::aarch64_sve_fmad>(IC, II,
2405 false))
2406 return FMAD;
2407 if (auto FMLA =
2408 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2409 Intrinsic::aarch64_sve_fmla>(IC, II,
2410 true))
2411 return FMLA;
2412 return std::nullopt;
2413}
2414
2415static std::optional<Instruction *>
2416 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2417 if (auto FMLA =
2418 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2419 Intrinsic::aarch64_sve_fmla>(IC, II,
2420 true))
2421 return FMLA;
2422 if (auto FMAD =
2423 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2424 Intrinsic::aarch64_sve_fmad>(IC, II,
2425 false))
2426 return FMAD;
2427 if (auto FMLA_U =
2428 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2429 Intrinsic::aarch64_sve_fmla_u>(
2430 IC, II, true))
2431 return FMLA_U;
2432 return instCombineSVEVectorBinOp(IC, II);
2433}
2434
2435static std::optional<Instruction *>
2436 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2437 if (auto FMLS =
2438 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2439 Intrinsic::aarch64_sve_fmls>(IC, II,
2440 true))
2441 return FMLS;
2442 if (auto FMSB =
2443 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2444 Intrinsic::aarch64_sve_fnmsb>(
2445 IC, II, false))
2446 return FMSB;
2447 if (auto FMLS =
2448 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2449 Intrinsic::aarch64_sve_fmls>(IC, II,
2450 true))
2451 return FMLS;
2452 return std::nullopt;
2453}
2454
2455static std::optional<Instruction *>
2456 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2457 if (auto FMLS =
2458 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2459 Intrinsic::aarch64_sve_fmls>(IC, II,
2460 true))
2461 return FMLS;
2462 if (auto FMSB =
2463 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2464 Intrinsic::aarch64_sve_fnmsb>(
2465 IC, II, false))
2466 return FMSB;
2467 if (auto FMLS_U =
2468 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2469 Intrinsic::aarch64_sve_fmls_u>(
2470 IC, II, true))
2471 return FMLS_U;
2472 return instCombineSVEVectorBinOp(IC, II);
2473}
2474
2475static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2476 IntrinsicInst &II) {
2477 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2478 Intrinsic::aarch64_sve_mls>(
2479 IC, II, true))
2480 return MLS;
2481 return std::nullopt;
2482}
2483
2484static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2485 IntrinsicInst &II) {
2486 Value *UnpackArg = II.getArgOperand(0);
2487 auto *RetTy = cast<ScalableVectorType>(II.getType());
2488 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2489 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2490
2491 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2492 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2493 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2494 ScalarArg =
2495 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2496 Value *NewVal =
2497 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2498 NewVal->takeName(&II);
2499 return IC.replaceInstUsesWith(II, NewVal);
2500 }
2501
2502 return std::nullopt;
2503}
2504static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2505 IntrinsicInst &II) {
2506 auto *OpVal = II.getOperand(0);
2507 auto *OpIndices = II.getOperand(1);
2508 VectorType *VTy = cast<VectorType>(II.getType());
2509
2510 // Check whether OpIndices is a constant splat value < minimal element count
2511 // of result.
2512 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2513 if (!SplatValue ||
2514 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2515 return std::nullopt;
2516
2517 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2518 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2519 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2520 auto *VectorSplat =
2521 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2522
2523 VectorSplat->takeName(&II);
2524 return IC.replaceInstUsesWith(II, VectorSplat);
2525}
2526
2527static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2528 IntrinsicInst &II) {
2529 Value *A, *B;
2530 Type *RetTy = II.getType();
2531 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2532 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2533
2534 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2535 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2536 if ((match(II.getArgOperand(0),
2537 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2538 match(II.getArgOperand(1),
2539 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2540 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2541 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2542 auto *TyA = cast<ScalableVectorType>(A->getType());
2543 if (TyA == B->getType() &&
2544 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2545 auto *SubVec = IC.Builder.CreateInsertVector(
2546 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2547 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2548 TyA->getMinNumElements());
2549 ConcatVec->takeName(&II);
2550 return IC.replaceInstUsesWith(II, ConcatVec);
2551 }
2552 }
2553
2554 return std::nullopt;
2555}
2556
2557static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2558 IntrinsicInst &II) {
2559 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2560 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2561 Value *A, *B;
2562 if (match(II.getArgOperand(0),
2563 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2564 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2565 m_Specific(A), m_Specific(B))))
2566 return IC.replaceInstUsesWith(
2567 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2568
2569 return std::nullopt;
2570}
2571
2572static std::optional<Instruction *>
2573 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2574 Value *Mask = II.getOperand(0);
2575 Value *BasePtr = II.getOperand(1);
2576 Value *Index = II.getOperand(2);
2577 Type *Ty = II.getType();
2578 Value *PassThru = ConstantAggregateZero::get(Ty);
2579
2580 // Contiguous gather => masked load.
2581 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2582 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2583 Value *IndexBase;
2584 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2585 m_Value(IndexBase), m_SpecificInt(1)))) {
2586 Align Alignment =
2587 BasePtr->getPointerAlignment(II.getDataLayout());
2588
2589 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2590 BasePtr, IndexBase);
2591 CallInst *MaskedLoad =
2592 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2593 MaskedLoad->takeName(&II);
2594 return IC.replaceInstUsesWith(II, MaskedLoad);
2595 }
2596
2597 return std::nullopt;
2598}
2599
2600static std::optional<Instruction *>
2601 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2602 Value *Val = II.getOperand(0);
2603 Value *Mask = II.getOperand(1);
2604 Value *BasePtr = II.getOperand(2);
2605 Value *Index = II.getOperand(3);
2606 Type *Ty = Val->getType();
2607
2608 // Contiguous scatter => masked store.
2609 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2610 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2611 Value *IndexBase;
2612 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2613 m_Value(IndexBase), m_SpecificInt(1)))) {
2614 Align Alignment =
2615 BasePtr->getPointerAlignment(II.getDataLayout());
2616
2617 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2618 BasePtr, IndexBase);
2619 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2620
2621 return IC.eraseInstFromFunction(II);
2622 }
2623
2624 return std::nullopt;
2625}
2626
2627static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2628 IntrinsicInst &II) {
2629 Type *Int32Ty = IC.Builder.getInt32Ty();
2630 Value *Pred = II.getOperand(0);
2631 Value *Vec = II.getOperand(1);
2632 Value *DivVec = II.getOperand(2);
2633
2634 Value *SplatValue = getSplatValue(DivVec);
2635 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2636 if (!SplatConstantInt)
2637 return std::nullopt;
2638
2639 APInt Divisor = SplatConstantInt->getValue();
2640 const int64_t DivisorValue = Divisor.getSExtValue();
2641 if (DivisorValue == -1)
2642 return std::nullopt;
2643 if (DivisorValue == 1)
2644 IC.replaceInstUsesWith(II, Vec);
2645
2646 if (Divisor.isPowerOf2()) {
2647 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2648 auto ASRD = IC.Builder.CreateIntrinsic(
2649 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2650 return IC.replaceInstUsesWith(II, ASRD);
2651 }
2652 if (Divisor.isNegatedPowerOf2()) {
2653 Divisor.negate();
2654 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2655 auto ASRD = IC.Builder.CreateIntrinsic(
2656 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2657 auto NEG = IC.Builder.CreateIntrinsic(
2658 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2659 return IC.replaceInstUsesWith(II, NEG);
2660 }
2661
2662 return std::nullopt;
2663}
2664
2665bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2666 size_t VecSize = Vec.size();
2667 if (VecSize == 1)
2668 return true;
2669 if (!isPowerOf2_64(VecSize))
2670 return false;
2671 size_t HalfVecSize = VecSize / 2;
2672
2673 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2674 RHS != Vec.end(); LHS++, RHS++) {
2675 if (*LHS != nullptr && *RHS != nullptr) {
2676 if (*LHS == *RHS)
2677 continue;
2678 else
2679 return false;
2680 }
2681 if (!AllowPoison)
2682 return false;
2683 if (*LHS == nullptr && *RHS != nullptr)
2684 *LHS = *RHS;
2685 }
2686
2687 Vec.resize(HalfVecSize);
2688 SimplifyValuePattern(Vec, AllowPoison);
2689 return true;
2690}
2691
2692// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2693// to dupqlane(f64(C)) where C is A concatenated with B
2694static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2695 IntrinsicInst &II) {
2696 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2697 if (!match(II.getOperand(0),
2698 m_Intrinsic<Intrinsic::vector_insert>(
2699 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2700 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2701 return std::nullopt;
2702 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2703
2704 // Insert the scalars into a container ordered by InsertElement index
2705 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2706 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2707 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2708 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2709 CurrentInsertElt = InsertElt->getOperand(0);
2710 }
2711
2712 bool AllowPoison =
2713 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2714 if (!SimplifyValuePattern(Elts, AllowPoison))
2715 return std::nullopt;
2716
2717 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2718 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2719 for (size_t I = 0; I < Elts.size(); I++) {
2720 if (Elts[I] == nullptr)
2721 continue;
2722 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2723 IC.Builder.getInt64(I));
2724 }
2725 if (InsertEltChain == nullptr)
2726 return std::nullopt;
2727
2728 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2729 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2730 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2731 // be narrowed back to the original type.
2732 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2733 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2734 IIScalableTy->getMinNumElements() /
2735 PatternWidth;
2736
2737 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2738 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2739 auto *WideShuffleMaskTy =
2740 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2741
2742 auto InsertSubvector = IC.Builder.CreateInsertVector(
2743 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2744 uint64_t(0));
2745 auto WideBitcast =
2746 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2747 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2748 auto WideShuffle = IC.Builder.CreateShuffleVector(
2749 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2750 auto NarrowBitcast =
2751 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2752
2753 return IC.replaceInstUsesWith(II, NarrowBitcast);
2754}
2755
2756static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2757 IntrinsicInst &II) {
2758 Value *A = II.getArgOperand(0);
2759 Value *B = II.getArgOperand(1);
2760 if (A == B)
2761 return IC.replaceInstUsesWith(II, A);
2762
2763 return std::nullopt;
2764}
2765
2766static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2767 IntrinsicInst &II) {
2768 Value *Pred = II.getOperand(0);
2769 Value *Vec = II.getOperand(1);
2770 Value *Shift = II.getOperand(2);
2771
2772 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2773 Value *AbsPred, *MergedValue;
2774 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2775 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2776 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2777 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2778
2779 return std::nullopt;
2780
2781 // Transform is valid if any of the following are true:
2782 // * The ABS merge value is an undef or non-negative
2783 // * The ABS predicate is all active
2784 // * The ABS predicate and the SRSHL predicates are the same
2785 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2786 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2787 return std::nullopt;
2788
2789 // Only valid when the shift amount is non-negative, otherwise the rounding
2790 // behaviour of SRSHL cannot be ignored.
2791 if (!match(Shift, m_NonNegative()))
2792 return std::nullopt;
2793
2794 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2795 {II.getType()}, {Pred, Vec, Shift});
2796
2797 return IC.replaceInstUsesWith(II, LSL);
2798}
2799
2800static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2801 IntrinsicInst &II) {
2802 Value *Vec = II.getOperand(0);
2803
2804 if (getSplatValue(Vec) == II.getOperand(1))
2805 return IC.replaceInstUsesWith(II, Vec);
2806
2807 return std::nullopt;
2808}
2809
2810static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2811 IntrinsicInst &II) {
2812 // If this barrier is post-dominated by an identical one, we can remove it.
2813 auto *NI = II.getNextNode();
2814 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2815 auto CanSkipOver = [](Instruction *I) {
2816 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2817 };
2818 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2819 auto *NIBB = NI->getParent();
2820 NI = NI->getNextNode();
2821 if (!NI) {
2822 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2823 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2824 else
2825 break;
2826 }
2827 }
2828 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2829 if (NextII && II.isIdenticalTo(NextII))
2830 return IC.eraseInstFromFunction(II);
2831
2832 return std::nullopt;
2833}
2834
2835static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2836 IntrinsicInst &II) {
2837 return IC.replaceInstUsesWith(
2838 II,
2839 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2840 {II.getType(), II.getOperand(0)->getType()},
2841 {II.getOperand(0), II.getOperand(1)}));
2842}
2843
2844static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2845 IntrinsicInst &II) {
2846 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2847 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2848 return std::nullopt;
2849}
2850
2851static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2852 IntrinsicInst &II,
2853 unsigned NumBits) {
2854 Value *Passthru = II.getOperand(0);
2855 Value *Pg = II.getOperand(1);
2856 Value *Op = II.getOperand(2);
2857
2858 // Convert UXT[BHW] to AND.
2859 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2860 auto *Ty = cast<VectorType>(II.getType());
2861 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2862 auto *Mask = ConstantInt::get(Ty, MaskValue);
2863 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2864 {Pg, Op, Mask});
2865 return IC.replaceInstUsesWith(II, And);
2866 }
2867
2868 return std::nullopt;
2869}
2870
2871static std::optional<Instruction *>
2872 instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2873 SMEAttrs FnSMEAttrs(*II.getFunction());
2874 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2875 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2876 return IC.replaceInstUsesWith(
2877 II, ConstantInt::getBool(II.getType(), IsStreaming));
2878 return std::nullopt;
2879}
2880
2881std::optional<Instruction *>
2882 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2883 IntrinsicInst &II) const {
2884 const SVEIntrinsicInfo IInfo = constructSVEIntrinsicInfo(II);
2885 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2886 return I;
2887
2888 Intrinsic::ID IID = II.getIntrinsicID();
2889 switch (IID) {
2890 default:
2891 break;
2892 case Intrinsic::aarch64_dmb:
2893 return instCombineDMB(IC, II);
2894 case Intrinsic::aarch64_neon_fmaxnm:
2895 case Intrinsic::aarch64_neon_fminnm:
2896 return instCombineMaxMinNM(IC, II);
2897 case Intrinsic::aarch64_sve_convert_from_svbool:
2898 return instCombineConvertFromSVBool(IC, II);
2899 case Intrinsic::aarch64_sve_dup:
2900 return instCombineSVEDup(IC, II);
2901 case Intrinsic::aarch64_sve_dup_x:
2902 return instCombineSVEDupX(IC, II);
2903 case Intrinsic::aarch64_sve_cmpne:
2904 case Intrinsic::aarch64_sve_cmpne_wide:
2905 return instCombineSVECmpNE(IC, II);
2906 case Intrinsic::aarch64_sve_rdffr:
2907 return instCombineRDFFR(IC, II);
2908 case Intrinsic::aarch64_sve_lasta:
2909 case Intrinsic::aarch64_sve_lastb:
2910 return instCombineSVELast(IC, II);
2911 case Intrinsic::aarch64_sve_clasta_n:
2912 case Intrinsic::aarch64_sve_clastb_n:
2913 return instCombineSVECondLast(IC, II);
2914 case Intrinsic::aarch64_sve_cntd:
2915 return instCombineSVECntElts(IC, II, 2);
2916 case Intrinsic::aarch64_sve_cntw:
2917 return instCombineSVECntElts(IC, II, 4);
2918 case Intrinsic::aarch64_sve_cnth:
2919 return instCombineSVECntElts(IC, II, 8);
2920 case Intrinsic::aarch64_sve_cntb:
2921 return instCombineSVECntElts(IC, II, 16);
2922 case Intrinsic::aarch64_sme_cntsd:
2923 return instCombineSMECntsd(IC, II, ST);
2924 case Intrinsic::aarch64_sve_ptest_any:
2925 case Intrinsic::aarch64_sve_ptest_first:
2926 case Intrinsic::aarch64_sve_ptest_last:
2927 return instCombineSVEPTest(IC, II);
2928 case Intrinsic::aarch64_sve_fadd:
2929 return instCombineSVEVectorFAdd(IC, II);
2930 case Intrinsic::aarch64_sve_fadd_u:
2931 return instCombineSVEVectorFAddU(IC, II);
2932 case Intrinsic::aarch64_sve_fmul_u:
2933 return instCombineSVEVectorBinOp(IC, II);
2934 case Intrinsic::aarch64_sve_fsub:
2935 return instCombineSVEVectorFSub(IC, II);
2936 case Intrinsic::aarch64_sve_fsub_u:
2937 return instCombineSVEVectorFSubU(IC, II);
2938 case Intrinsic::aarch64_sve_add:
2939 return instCombineSVEVectorAdd(IC, II);
2940 case Intrinsic::aarch64_sve_add_u:
2941 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2942 Intrinsic::aarch64_sve_mla_u>(
2943 IC, II, true);
2944 case Intrinsic::aarch64_sve_sub:
2945 return instCombineSVEVectorSub(IC, II);
2946 case Intrinsic::aarch64_sve_sub_u:
2947 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2948 Intrinsic::aarch64_sve_mls_u>(
2949 IC, II, true);
2950 case Intrinsic::aarch64_sve_tbl:
2951 return instCombineSVETBL(IC, II);
2952 case Intrinsic::aarch64_sve_uunpkhi:
2953 case Intrinsic::aarch64_sve_uunpklo:
2954 case Intrinsic::aarch64_sve_sunpkhi:
2955 case Intrinsic::aarch64_sve_sunpklo:
2956 return instCombineSVEUnpack(IC, II);
2957 case Intrinsic::aarch64_sve_uzp1:
2958 return instCombineSVEUzp1(IC, II);
2959 case Intrinsic::aarch64_sve_zip1:
2960 case Intrinsic::aarch64_sve_zip2:
2961 return instCombineSVEZip(IC, II);
2962 case Intrinsic::aarch64_sve_ld1_gather_index:
2963 return instCombineLD1GatherIndex(IC, II);
2964 case Intrinsic::aarch64_sve_st1_scatter_index:
2965 return instCombineST1ScatterIndex(IC, II);
2966 case Intrinsic::aarch64_sve_ld1:
2967 return instCombineSVELD1(IC, II, DL);
2968 case Intrinsic::aarch64_sve_st1:
2969 return instCombineSVEST1(IC, II, DL);
2970 case Intrinsic::aarch64_sve_sdiv:
2971 return instCombineSVESDIV(IC, II);
2972 case Intrinsic::aarch64_sve_sel:
2973 return instCombineSVESel(IC, II);
2974 case Intrinsic::aarch64_sve_srshl:
2975 return instCombineSVESrshl(IC, II);
2976 case Intrinsic::aarch64_sve_dupq_lane:
2977 return instCombineSVEDupqLane(IC, II);
2978 case Intrinsic::aarch64_sve_insr:
2979 return instCombineSVEInsr(IC, II);
2980 case Intrinsic::aarch64_sve_whilelo:
2981 return instCombineWhilelo(IC, II);
2982 case Intrinsic::aarch64_sve_ptrue:
2983 return instCombinePTrue(IC, II);
2984 case Intrinsic::aarch64_sve_uxtb:
2985 return instCombineSVEUxt(IC, II, 8);
2986 case Intrinsic::aarch64_sve_uxth:
2987 return instCombineSVEUxt(IC, II, 16);
2988 case Intrinsic::aarch64_sve_uxtw:
2989 return instCombineSVEUxt(IC, II, 32);
2990 case Intrinsic::aarch64_sme_in_streaming_mode:
2991 return instCombineInStreamingMode(IC, II);
2992 }
2993
2994 return std::nullopt;
2995}
2996
2997 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2998 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2999 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3000 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3001 SimplifyAndSetOp) const {
3002 switch (II.getIntrinsicID()) {
3003 default:
3004 break;
3005 case Intrinsic::aarch64_neon_fcvtxn:
3006 case Intrinsic::aarch64_neon_rshrn:
3007 case Intrinsic::aarch64_neon_sqrshrn:
3008 case Intrinsic::aarch64_neon_sqrshrun:
3009 case Intrinsic::aarch64_neon_sqshrn:
3010 case Intrinsic::aarch64_neon_sqshrun:
3011 case Intrinsic::aarch64_neon_sqxtn:
3012 case Intrinsic::aarch64_neon_sqxtun:
3013 case Intrinsic::aarch64_neon_uqrshrn:
3014 case Intrinsic::aarch64_neon_uqshrn:
3015 case Intrinsic::aarch64_neon_uqxtn:
3016 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3017 break;
3018 }
3019
3020 return std::nullopt;
3021}
3022
3023 bool AArch64TTIImpl::enableScalableVectorization() const {
3024 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3025 EnableScalableAutovecInStreamingMode);
3026}
3027
3028 TypeSize
3029 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3030 switch (K) {
3031 case TargetTransformInfo::RGK_Scalar:
3032 return TypeSize::getFixed(64);
3033 case TargetTransformInfo::RGK_FixedWidthVector:
3034 if (ST->useSVEForFixedLengthVectors() &&
3035 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3036 return TypeSize::getFixed(
3037 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3038 else if (ST->isNeonAvailable())
3039 return TypeSize::getFixed(128);
3040 else
3041 return TypeSize::getFixed(0);
3042 case TargetTransformInfo::RGK_ScalableVector:
3043 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3044 EnableScalableAutovecInStreamingMode))
3045 return TypeSize::getScalable(128);
3046 else
3047 return TypeSize::getScalable(0);
3048 }
3049 llvm_unreachable("Unsupported register kind");
3050}
3051
3052bool AArch64TTIImpl::isSingleExtWideningInstruction(
3053 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3054 Type *SrcOverrideTy) const {
3055 // A helper that returns a vector type whose element type is taken from the
3056 // given type and whose element count matches DstTy.
3057 auto toVectorTy = [&](Type *ArgTy) {
3058 return VectorType::get(ArgTy->getScalarType(),
3059 cast<VectorType>(DstTy)->getElementCount());
3060 };
3061
3062 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3063 // i32, i64]. SVE doesn't generally have the same set of instructions to
3064 // perform an extend with the add/sub/mul. There are SMULLB style
3065 // instructions, but they operate on top/bottom, requiring some sort of lane
3066 // interleaving to be used with zext/sext.
3067 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3068 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3069 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3070 return false;
3071
3072 Type *SrcTy = SrcOverrideTy;
3073 switch (Opcode) {
3074 case Instruction::Add: // UADDW(2), SADDW(2).
3075 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3076 // The second operand needs to be an extend
3077 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3078 if (!SrcTy)
3079 SrcTy =
3080 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3081 break;
3082 }
3083
3084 if (Opcode == Instruction::Sub)
3085 return false;
3086
3087 // UADDW(2), SADDW(2) can be commuted.
3088 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3089 if (!SrcTy)
3090 SrcTy =
3091 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3092 break;
3093 }
3094 return false;
3095 }
3096 default:
3097 return false;
3098 }
3099
3100 // Legalize the destination type and ensure it can be used in a widening
3101 // operation.
3102 auto DstTyL = getTypeLegalizationCost(DstTy);
3103 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3104 return false;
3105
3106 // Legalize the source type and ensure it can be used in a widening
3107 // operation.
3108 assert(SrcTy && "Expected some SrcTy");
3109 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3110 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3111 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3112 return false;
3113
3114 // Get the total number of vector elements in the legalized types.
3115 InstructionCost NumDstEls =
3116 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3117 InstructionCost NumSrcEls =
3118 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3119
3120 // Return true if the legalized types have the same number of vector elements
3121 // and the destination element type size is twice that of the source type.
3122 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3123}
3124
3125Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3126 ArrayRef<const Value *> Args,
3127 Type *SrcOverrideTy) const {
3128 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3129 Opcode != Instruction::Mul)
3130 return nullptr;
3131
3132 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3133 // i32, i64]. SVE doesn't generally have the same set of instructions to
3134 // perform an extend with the add/sub/mul. There are SMULLB style
3135 // instructions, but they operate on top/bottom, requiring some sort of lane
3136 // interleaving to be used with zext/sext.
3137 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3138 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3139 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3140 return nullptr;
3141
3142 auto getScalarSizeWithOverride = [&](const Value *V) {
3143 if (SrcOverrideTy)
3144 return SrcOverrideTy->getScalarSizeInBits();
3145 return cast<Instruction>(V)
3146 ->getOperand(0)
3147 ->getType()
3148 ->getScalarSizeInBits();
3149 };
3150
3151 unsigned MaxEltSize = 0;
3152 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3153 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3154 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3155 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3156 MaxEltSize = std::max(EltSize0, EltSize1);
3157 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3158 isa<SExtInst, ZExtInst>(Args[1])) {
3159 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3160 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3161 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3162 // enough.
3163 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3164 return nullptr;
3165 MaxEltSize = DstEltSize / 2;
3166 } else if (Opcode == Instruction::Mul &&
3167 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3168 // If one of the operands is a Zext and the other has enough zero bits
3169 // to be treated as unsigned, we can still generate a umull, meaning the
3170 // zext is free.
3171 KnownBits Known =
3172 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3173 if (Args[0]->getType()->getScalarSizeInBits() -
3174 Known.Zero.countLeadingOnes() >
3175 DstTy->getScalarSizeInBits() / 2)
3176 return nullptr;
3177
3178 MaxEltSize =
3179 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3180 } else
3181 return nullptr;
3182
3183 if (MaxEltSize * 2 > DstEltSize)
3184 return nullptr;
3185
3186 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3187 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3188 return nullptr;
3189 return ExtTy;
3190}
3191
3192// s/urhadd instructions implement the following pattern, making the
3193// extends free:
3194// %x = add ((zext i8 -> i16), 1)
3195// %y = (zext i8 -> i16)
3196// trunc i16 (lshr (add %x, %y), 1) -> i8
3197//
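// A concrete (illustrative) instance of the pattern: with <16 x i8> inputs,
//   %ax = zext <16 x i8> %a to <16 x i16>
//   %bx = zext <16 x i8> %b to <16 x i16>
//   %s  = add <16 x i16> %ax, splat (i16 1)
//   %t  = add <16 x i16> %s, %bx
//   %sh = lshr <16 x i16> %t, splat (i16 1)
//   %r  = trunc <16 x i16> %sh to <16 x i8>
// can be selected as a single urhadd, so both extends are costed as free.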
3198bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3199                                        Type *Src) const {
3200 // The source should be a legal vector type.
3201 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3202 (Src->isScalableTy() && !ST->hasSVE2()))
3203 return false;
3204
3205 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3206 return false;
3207
3208  // Look for trunc/lshr/add before trying to match the pattern.
3209 const Instruction *Add = ExtUser;
3210 auto *AddUser =
3211 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3212 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3213 Add = AddUser;
3214
3215 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3216 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3217 return false;
3218
3219 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3220 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3221 Src->getScalarSizeInBits() !=
3222 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3223 return false;
3224
3225 // Try to match the whole pattern. Ext could be either the first or second
3226 // m_ZExtOrSExt matched.
3227 Instruction *Ex1, *Ex2;
3228 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3229 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3230 return false;
3231
3232 // Ensure both extends are of the same type
3233 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3234 Ex1->getOpcode() == Ex2->getOpcode())
3235 return true;
3236
3237 return false;
3238}
3239
3240InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3241                                                 Type *Src,
3242                                                 TTI::CastContextHint CCH,
3243                                                 TTI::TargetCostKind CostKind,
3244                                                 const Instruction *I) const {
3245 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3246 assert(ISD && "Invalid opcode");
3247 // If the cast is observable, and it is used by a widening instruction (e.g.,
3248 // uaddl, saddw, etc.), it may be free.
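  // Illustrative (assumed) example: in
  //   mul <4 x i32> (zext <4 x i8> %a to <4 x i32>), (zext <4 x i16> %b to <4 x i32>)
  // the widening mul operates on <4 x i16> inputs, so only the narrower
  // operand pays for a <4 x i8> -> <4 x i16> extend; the final
  // <4 x i16> -> <4 x i32> step is folded into the widening instruction.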
3249 if (I && I->hasOneUser()) {
3250 auto *SingleUser = cast<Instruction>(*I->user_begin());
3251 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3252 if (Type *ExtTy = isBinExtWideningInstruction(
3253 SingleUser->getOpcode(), Dst, Operands,
3254 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3255      // The cost from Src->Src*2 needs to be added if required; the cost
3256      // from Src*2->ExtTy is free.
3257 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3258 Type *DoubleSrcTy =
3259 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3260        return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3261                                TTI::CastContextHint::None, CostKind, I);
3262      }
3263
3264 return 0;
3265 }
3266
3267 if (isSingleExtWideningInstruction(
3268 SingleUser->getOpcode(), Dst, Operands,
3269 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3270 // For adds only count the second operand as free if both operands are
3271      // extends but not the same operation (i.e. both operands are not free in
3272 // add(sext, zext)).
3273 if (SingleUser->getOpcode() == Instruction::Add) {
3274 if (I == SingleUser->getOperand(1) ||
3275 (isa<CastInst>(SingleUser->getOperand(1)) &&
3276 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3277 return 0;
3278 } else {
3279 // Others are free so long as isSingleExtWideningInstruction
3280 // returned true.
3281 return 0;
3282 }
3283 }
3284
3285 // The cast will be free for the s/urhadd instructions
3286 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3287 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3288 return 0;
3289 }
3290
3291 // TODO: Allow non-throughput costs that aren't binary.
3292 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3293    if (CostKind != TTI::TCK_RecipThroughput)
3294      return Cost == 0 ? 0 : 1;
3295 return Cost;
3296 };
3297
3298 EVT SrcTy = TLI->getValueType(DL, Src);
3299 EVT DstTy = TLI->getValueType(DL, Dst);
3300
3301 if (!SrcTy.isSimple() || !DstTy.isSimple())
3302 return AdjustCost(
3303 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3304
3305 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3306 // we use fcvtx under SVE2. Give them invalid costs.
3307 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3308 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3309 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3310    return InstructionCost::getInvalid();
3311
3312 static const TypeConversionCostTblEntry BF16Tbl[] = {
3313 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3314 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3315 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3316 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3317 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3318 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3319 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3320 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3321 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3322 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3323 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3324 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3325 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3326 };
3327
3328 if (ST->hasBF16())
3329 if (const auto *Entry = ConvertCostTableLookup(
3330 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3331 return AdjustCost(Entry->Cost);
3332
3333  // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3334 // The cost of unpacking twice is artificially increased for now in order
3335 // to avoid regressions against NEON, which will use tbl instructions directly
3336 // instead of multiple layers of [s|u]unpk[lo|hi].
3337 // We use the unpacks in cases where the destination type is illegal and
3338 // requires splitting of the input, even if the input type itself is legal.
3339 const unsigned int SVE_EXT_COST = 1;
3340 const unsigned int SVE_FCVT_COST = 1;
3341 const unsigned int SVE_UNPACK_ONCE = 4;
3342 const unsigned int SVE_UNPACK_TWICE = 16;
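  // As a worked example using the constants above: the nxv8f32 <- nxv8i8
  // entries below cost SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST
  // = 1 + 4 + 2 = 7, i.e. one extend, one (deliberately over-costed) level of
  // unpacking and two conversions.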
3343
3344 static const TypeConversionCostTblEntry ConversionTbl[] = {
3345 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3346 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3347 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3348 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3349 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3350 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3351 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3352 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3353 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3354 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3355 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3356 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3357 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3358 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3359 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3360 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3361 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3362 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3363 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3364 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3365
3366 // Truncations on nxvmiN
3367 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3368 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3369 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3370 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3371 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3372 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3373 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3374 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3375 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3376 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3377 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3378 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3379 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3380 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3381 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3382 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3383 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3384 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3385 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3386 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3387 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3388 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3389 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3390 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3391 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3392 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3393 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3394 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3395 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3396 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3397 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3398 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3399 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3400
3401 // The number of shll instructions for the extension.
3402 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3403 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3404 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3405 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3406 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3407 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3408 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3409 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3410 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3411 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3412 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3413 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3414 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3415 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3416 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3417 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3418
3419 // FP Ext and trunc
3420 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3421 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3422 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3423 // FP16
3424 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3425 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3426 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3427 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3428 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3429 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3430 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3431 // BF16 (uses shift)
3432 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3433 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3434 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3435 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3436 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3437 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3438 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3439 // FP Ext and trunc
3440 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3441 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3442 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3443 // FP16
3444 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3445 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3446 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3447 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3448 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3449 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3450 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3451 // BF16 (more complex, with +bf16 is handled above)
3452 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3453 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3454 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3455 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3456 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3457 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3458 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3459 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3460
3461 // LowerVectorINT_TO_FP:
3462 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3463 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3464 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3465 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3466 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3467 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3468
3469 // SVE: to nxv2f16
3470 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3471 SVE_EXT_COST + SVE_FCVT_COST},
3472 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3473 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3474 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3475 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3476 SVE_EXT_COST + SVE_FCVT_COST},
3477 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3478 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3479 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3480
3481 // SVE: to nxv4f16
3482 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3483 SVE_EXT_COST + SVE_FCVT_COST},
3484 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3485 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3486 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3487 SVE_EXT_COST + SVE_FCVT_COST},
3488 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3489 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3490
3491 // SVE: to nxv8f16
3492 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3493 SVE_EXT_COST + SVE_FCVT_COST},
3494 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3495 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3496 SVE_EXT_COST + SVE_FCVT_COST},
3497 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3498
3499 // SVE: to nxv16f16
3500 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3501 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3502 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3503 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3504
3505 // Complex: to v2f32
3506 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3507 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3508 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3509 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3510
3511 // SVE: to nxv2f32
3512 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3513 SVE_EXT_COST + SVE_FCVT_COST},
3514 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3515 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3516 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3517 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3518 SVE_EXT_COST + SVE_FCVT_COST},
3519 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3520 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3521 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3522
3523 // Complex: to v4f32
3524 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3525 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3526 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3527 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3528
3529 // SVE: to nxv4f32
3530 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3531 SVE_EXT_COST + SVE_FCVT_COST},
3532 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3533 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3534 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3535 SVE_EXT_COST + SVE_FCVT_COST},
3536 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3537 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3538
3539 // Complex: to v8f32
3540 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3541 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3542 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3543 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3544
3545 // SVE: to nxv8f32
3546 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3547 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3548 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3549 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3550 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3551 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3552 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3553 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3554
3555 // SVE: to nxv16f32
3556 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3557 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3558 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3559 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3560
3561 // Complex: to v16f32
3562 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3563 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3564
3565 // Complex: to v2f64
3566 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3567 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3568 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3569 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3570 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3571 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3572
3573 // SVE: to nxv2f64
3574 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3575 SVE_EXT_COST + SVE_FCVT_COST},
3576 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3577 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3578 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3579 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3580 SVE_EXT_COST + SVE_FCVT_COST},
3581 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3582 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3583 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3584
3585 // Complex: to v4f64
3586 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3587 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3588
3589 // SVE: to nxv4f64
3590 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3591 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3592 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3593 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3594 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3595 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3596 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3597 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3598 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3599 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3600 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3601 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3602
3603 // SVE: to nxv8f64
3604 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3605 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3606 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3607 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3608 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3609 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3610 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3611 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3612
3613 // LowerVectorFP_TO_INT
3614 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3615 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3616 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3617 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3618 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3619 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3620
3621 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3622 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3623 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3624 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3625 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3626 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3627 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3628
3629 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3630 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3631 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3632 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3633 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3634
3635 // Complex, from nxv2f32.
3636 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3637 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3638 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3639 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3640 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3641 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3642 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3643 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3644
3645 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3646 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3647 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3648 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3649 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3650 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3651 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3652
3653 // Complex, from nxv2f64.
3654 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3655 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3656 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3657 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3658 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3659 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3660 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3661 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3662 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3663 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3664
3665 // Complex, from nxv4f32.
3666 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3667 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3668 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3669 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3670 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3671 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3672 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3673 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3674 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3675 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3676
3677 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3678 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3679 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3680 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3681 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3682
3683 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3684 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3685 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3686 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3687 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3688 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3689 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3690
3691 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3692 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3693 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3694 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3695 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3696
3697 // Complex, from nxv8f16.
3698 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3699 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3700 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3701 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3702 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3703 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3704 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3705 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3706 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3707 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3708
3709 // Complex, from nxv4f16.
3710 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3711 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3712 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3713 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3714 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3715 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3716 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3717 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3718
3719 // Complex, from nxv2f16.
3720 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3721 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3722 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3723 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3724 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3725 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3726 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3727 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3728
3729 // Truncate from nxvmf32 to nxvmf16.
3730 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3731 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3732 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3733
3734 // Truncate from nxvmf32 to nxvmbf16.
3735 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3736 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3737 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3738
3739 // Truncate from nxvmf64 to nxvmf16.
3740 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3741 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3742 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3743
3744 // Truncate from nxvmf64 to nxvmbf16.
3745 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3746 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3747 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3748
3749 // Truncate from nxvmf64 to nxvmf32.
3750 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3751 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3752 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3753
3754 // Extend from nxvmf16 to nxvmf32.
3755 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3756 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3757 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3758
3759 // Extend from nxvmbf16 to nxvmf32.
3760 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3761 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3762 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3763
3764 // Extend from nxvmf16 to nxvmf64.
3765 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3766 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3767 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3768
3769 // Extend from nxvmbf16 to nxvmf64.
3770 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3771 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3772 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3773
3774 // Extend from nxvmf32 to nxvmf64.
3775 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3776 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3777 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3778
3779 // Bitcasts from float to integer
3780 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3781 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3782 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3783
3784 // Bitcasts from integer to float
3785 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3786 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3787 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3788
3789 // Add cost for extending to illegal -too wide- scalable vectors.
3790 // zero/sign extend are implemented by multiple unpack operations,
3791 // where each operation has a cost of 1.
3792 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3793 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3794 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3795 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3796 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3797 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3798
3799 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3800 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3801 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3802 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3803 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3804 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3805 };
3806
3807  // We have to estimate the cost of a fixed-length operation performed on
3808  // SVE registers, scaled by the number of SVE registers required to
3809  // represent the fixed-width type.
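  // For example (illustrative): on a target using SVE for fixed-length
  // vectors, a fixed v8i64 -> v8i32 truncate legalizes to several SVE
  // registers, so the cost below becomes LT.first copies of the equivalent
  // nxv2i64 -> nxv2i32 conversion.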
3810 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3811 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3812 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3813 ST->useSVEForFixedLengthVectors(WiderTy)) {
3814 std::pair<InstructionCost, MVT> LT =
3815 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3816 unsigned NumElements =
3817 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3818 return AdjustCost(
3819        LT.first *
3820            getCastInstrCost(
3821                Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3822 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3823 CostKind, I));
3824 }
3825
3826 if (const auto *Entry = ConvertCostTableLookup(
3827 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3828 return AdjustCost(Entry->Cost);
3829
3830 static const TypeConversionCostTblEntry FP16Tbl[] = {
3831 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3832 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3833 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3834 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3835 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3836 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3837 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3838 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3839 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3840 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3841 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3842 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3843 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3844 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3845 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3846 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3847 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3848 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3849 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3850 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3851 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3852 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3853 };
3854
3855 if (ST->hasFullFP16())
3856 if (const auto *Entry = ConvertCostTableLookup(
3857 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3858 return AdjustCost(Entry->Cost);
3859
3860 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3861 // double-rounding issues.
3862 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3863 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3865 return AdjustCost(
3867 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3868 CCH, CostKind) +
3870 CostKind) +
3872 CostKind));
3873
3874 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3876 ST->isSVEorStreamingSVEAvailable() &&
3877      TLI->getTypeAction(Src->getContext(), SrcTy) ==
3878          TargetLowering::TypePromoteInteger &&
3879      TLI->getTypeAction(Dst->getContext(), DstTy) ==
3880          TargetLowering::TypeSplitVector) {
3881 // The standard behaviour in the backend for these cases is to split the
3882 // extend up into two parts:
3883 // 1. Perform an extending load or masked load up to the legal type.
3884 // 2. Extend the loaded data to the final type.
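    // For example (illustrative): a masked zext from nxv8i8 to nxv8i64 would
    // be costed as an extending masked load up to the legal nxv8i16 type
    // (Part1) plus an nxv8i16 -> nxv8i64 extend (Part2).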
3885 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3886 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3887    InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3888        Opcode, LegalTy, Src, CCH, CostKind, I);
3889    InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3890        Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3891 return Part1 + Part2;
3892 }
3893
3894 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3895 // but we also want to include the TTI::CastContextHint::Masked case too.
3896 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3897      CCH == TTI::CastContextHint::Masked &&
3898      ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3899    CCH = TTI::CastContextHint::Normal;
3900
3901 return AdjustCost(
3902 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3903}
3904
3905InstructionCost
3906AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3907                                         VectorType *VecTy, unsigned Index,
3908                                         TTI::TargetCostKind CostKind) const {
3909
3910 // Make sure we were given a valid extend opcode.
3911 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3912 "Invalid opcode");
3913
3914 // We are extending an element we extract from a vector, so the source type
3915 // of the extend is the element type of the vector.
3916 auto *Src = VecTy->getElementType();
3917
3918 // Sign- and zero-extends are for integer types only.
3919 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3920
3921 // Get the cost for the extract. We compute the cost (if any) for the extend
3922 // below.
3923 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3924 CostKind, Index, nullptr, nullptr);
3925
3926 // Legalize the types.
3927 auto VecLT = getTypeLegalizationCost(VecTy);
3928 auto DstVT = TLI->getValueType(DL, Dst);
3929 auto SrcVT = TLI->getValueType(DL, Src);
3930
3931 // If the resulting type is still a vector and the destination type is legal,
3932 // we may get the extension for free. If not, get the default cost for the
3933 // extend.
3934 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3935 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3936 CostKind);
3937
3938 // The destination type should be larger than the element type. If not, get
3939 // the default cost for the extend.
3940 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3941 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3942 CostKind);
3943
3944 switch (Opcode) {
3945 default:
3946 llvm_unreachable("Opcode should be either SExt or ZExt");
3947
3948 // For sign-extends, we only need a smov, which performs the extension
3949 // automatically.
3950 case Instruction::SExt:
3951 return Cost;
3952
3953 // For zero-extends, the extend is performed automatically by a umov unless
3954 // the destination type is i64 and the element type is i8 or i16.
3955 case Instruction::ZExt:
3956 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3957 return Cost;
3958 }
3959
3960 // If we are unable to perform the extend for free, get the default cost.
3961 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3962 CostKind);
3963}
3964
3965InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3966                                               TTI::TargetCostKind CostKind,
3967                                               const Instruction *I) const {
3968  if (CostKind != TTI::TCK_RecipThroughput)
3969    return Opcode == Instruction::PHI ? 0 : 1;
3970 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3971 // Branches are assumed to be predicted.
3972 return 0;
3973}
3974
3975InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3976 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3977 const Instruction *I, Value *Scalar,
3978 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3979 assert(Val->isVectorTy() && "This must be a vector type");
3980
3981 if (Index != -1U) {
3982 // Legalize the type.
3983 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3984
3985 // This type is legalized to a scalar type.
3986 if (!LT.second.isVector())
3987 return 0;
3988
3989 // The type may be split. For fixed-width vectors we can normalize the
3990 // index to the new type.
3991 if (LT.second.isFixedLengthVector()) {
3992 unsigned Width = LT.second.getVectorNumElements();
3993 Index = Index % Width;
3994 }
3995
3996    // The element at index zero is already inside the vector.
3997    // - For an insert-element or extract-element
3998    //   instruction that extracts integers, an explicit FPR -> GPR move is
3999    //   needed, so it has a non-zero cost.
4000 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4001 return 0;
4002
4003 // This is recognising a LD1 single-element structure to one lane of one
4004    // register instruction. I.e., if this is an `insertelement` instruction,
4005    // and its second operand is a load, then we will generate an LD1, which
4006    // is an expensive instruction.
4007 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
4008 return CostKind == TTI::TCK_CodeSize
4009 ? 0
4011
4012 // i1 inserts and extract will include an extra cset or cmp of the vector
4013 // value. Increase the cost by 1 to account.
4014 if (Val->getScalarSizeInBits() == 1)
4015 return CostKind == TTI::TCK_CodeSize
4016 ? 2
4017                 : ST->getVectorInsertExtractBaseCost() + 1;
4018
4019 // FIXME:
4020 // If the extract-element and insert-element instructions could be
4021 // simplified away (e.g., could be combined into users by looking at use-def
4022 // context), they have no cost. This is not done in the first place for
4023 // compile-time considerations.
4024 }
4025
4026 // In case of Neon, if there exists extractelement from lane != 0 such that
4027 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4028 // 2. extractelement result feeds into fmul.
4029 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4030 // equivalent to 0.
4031 // then the extractelement can be merged with fmul in the backend and it
4032 // incurs no cost.
4033 // e.g.
4034 // define double @foo(<2 x double> %a) {
4035 // %1 = extractelement <2 x double> %a, i32 0
4036 // %2 = extractelement <2 x double> %a, i32 1
4037 // %res = fmul double %1, %2
4038 // ret double %res
4039 // }
4040 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4041 auto ExtractCanFuseWithFmul = [&]() {
4042 // We bail out if the extract is from lane 0.
4043 if (Index == 0)
4044 return false;
4045
4046 // Check if the scalar element type of the vector operand of ExtractElement
4047 // instruction is one of the allowed types.
4048 auto IsAllowedScalarTy = [&](const Type *T) {
4049 return T->isFloatTy() || T->isDoubleTy() ||
4050 (T->isHalfTy() && ST->hasFullFP16());
4051 };
4052
4053 // Check if the extractelement user is scalar fmul.
4054 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4055 // Check if the user is scalar fmul.
4056 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4057 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4058 !BO->getType()->isVectorTy();
4059 };
4060
4061 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4062 // certain scalar type and a certain vector register width.
4063 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4064 auto RegWidth =
4065          getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4066              .getFixedValue();
4067 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4068 };
4069
4070 // Check if the type constraints on input vector type and result scalar type
4071 // of extractelement instruction are satisfied.
4072 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4073 return false;
4074
4075 if (Scalar) {
4076 DenseMap<User *, unsigned> UserToExtractIdx;
4077 for (auto *U : Scalar->users()) {
4078 if (!IsUserFMulScalarTy(U))
4079 return false;
4080 // Recording entry for the user is important. Index value is not
4081 // important.
4082 UserToExtractIdx[U];
4083 }
4084 if (UserToExtractIdx.empty())
4085 return false;
4086 for (auto &[S, U, L] : ScalarUserAndIdx) {
4087 for (auto *U : S->users()) {
4088 if (UserToExtractIdx.contains(U)) {
4089 auto *FMul = cast<BinaryOperator>(U);
4090 auto *Op0 = FMul->getOperand(0);
4091 auto *Op1 = FMul->getOperand(1);
4092 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4093 UserToExtractIdx[U] = L;
4094 break;
4095 }
4096 }
4097 }
4098 }
4099 for (auto &[U, L] : UserToExtractIdx) {
4100 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4101 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4102 return false;
4103 }
4104 } else {
4105 const auto *EE = cast<ExtractElementInst>(I);
4106
4107 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4108 if (!IdxOp)
4109 return false;
4110
4111 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4112 if (!IsUserFMulScalarTy(U))
4113 return false;
4114
4115 // Check if the other operand of extractelement is also extractelement
4116 // from lane equivalent to 0.
4117 const auto *BO = cast<BinaryOperator>(U);
4118 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4119 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4120 if (OtherEE) {
4121 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4122 if (!IdxOp)
4123 return false;
4124 return IsExtractLaneEquivalentToZero(
4125 cast<ConstantInt>(OtherEE->getIndexOperand())
4126 ->getValue()
4127 .getZExtValue(),
4128 OtherEE->getType()->getScalarSizeInBits());
4129 }
4130 return true;
4131 });
4132 }
4133 return true;
4134 };
4135
4136 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4137 ExtractCanFuseWithFmul())
4138 return 0;
4139
4140 // All other insert/extracts cost this much.
4141 return CostKind == TTI::TCK_CodeSize ? 1
4142 : ST->getVectorInsertExtractBaseCost();
4143}
4144
4145InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4146                                                   TTI::TargetCostKind CostKind,
4147                                                   unsigned Index,
4148 const Value *Op0,
4149 const Value *Op1) const {
4150 // Treat insert at lane 0 into a poison vector as having zero cost. This
4151  // ensures vector broadcasts via an insert + shuffle (which will be lowered
4152  // to a single dup) are treated as cheap.
4153 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4154 isa<PoisonValue>(Op0))
4155 return 0;
4156 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4157}
4158
4159InstructionCost AArch64TTIImpl::getVectorInstrCost(
4160    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4161 Value *Scalar,
4162 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4163 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4164 ScalarUserAndIdx);
4165}
4166
4167InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4168                                                   Type *Val,
4169                                                   TTI::TargetCostKind CostKind,
4170                                                   unsigned Index) const {
4171 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4172}
4173
4177 unsigned Index) const {
4178 if (isa<FixedVectorType>(Val))
4180 Index);
4181
4182 // This typically requires both while and lastb instructions in order
4183 // to extract the last element. If this is in a loop the while
4184 // instruction can at least be hoisted out, although it will consume a
4185  // predicate register. The cost should be higher than the base extract
4186  // cost, which is 2 for most CPUs.
4187 return CostKind == TTI::TCK_CodeSize
4188 ? 2
4189 : ST->getVectorInsertExtractBaseCost() + 1;
4190}
4191
4192InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4193    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4194 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4195 ArrayRef<Value *> VL) const {
4198 if (Ty->getElementType()->isFloatingPointTy())
4199 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4200 CostKind);
4201 unsigned VecInstCost =
4202 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4203 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4204}
4205
4206std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4207    Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4208    TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4209 std::function<InstructionCost(Type *)> InstCost) const {
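  // Sketch of the costing below: an fp16/bf16 operation without native
  // support is modelled as fpext(s) to f32, the f32 operation, and an
  // optional fptrunc back. For example (illustrative), a <4 x half> fadd with
  // two non-constant operands becomes 2 fpexts + a <4 x float> fadd + 1
  // fptrunc.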
4210 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4211 return std::nullopt;
4212 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4213 return std::nullopt;
4214 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4215 ST->isNonStreamingSVEorSME2Available())
4216 return std::nullopt;
4217
4218 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4219 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4220                                           TTI::CastContextHint::None, CostKind);
4221  if (!Op1Info.isConstant() && !Op2Info.isConstant())
4222 Cost *= 2;
4223 Cost += InstCost(PromotedTy);
4224 if (IncludeTrunc)
4225 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4226                             TTI::CastContextHint::None, CostKind);
4227  return Cost;
4228}
4229
4230InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4231    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4232    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4233    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4234
4235 // The code-generator is currently not able to handle scalable vectors
4236 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4237 // it. This change will be removed when code-generation for these types is
4238 // sufficiently reliable.
4239 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4240 if (VTy->getElementCount() == ElementCount::getScalable(1))
4241      return InstructionCost::getInvalid();
4242
4243 // TODO: Handle more cost kinds.
4244  if (CostKind != TTI::TCK_RecipThroughput)
4245    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4246 Op2Info, Args, CxtI);
4247
4248 // Legalize the type.
4249 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4250 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4251
4252 // Increase the cost for half and bfloat types if not architecturally
4253 // supported.
4254 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4255 ISD == ISD::FDIV || ISD == ISD::FREM)
4256 if (auto PromotedCost = getFP16BF16PromoteCost(
4257 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4258 // There is not native support for fdiv/frem even with +sve-b16b16.
4259 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4260 [&](Type *PromotedTy) {
4261 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4262 Op1Info, Op2Info);
4263 }))
4264 return *PromotedCost;
4265
4266  // If the operation is a widening instruction (smull or umull) and both
4267  // operands are extends, the cost can be reduced by modelling the operation
4268  // as acting on the narrowest possible type (double the largest input size)
4269  // followed by a further extend.
4270 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4271 if (ExtTy != Ty)
4272 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4273 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4274                            TTI::CastContextHint::None, CostKind);
4275    return LT.first;
4276 }
4277
4278 switch (ISD) {
4279 default:
4280 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4281 Op2Info);
4282 case ISD::SREM:
4283 case ISD::SDIV:
4284 /*
4285 Notes for sdiv/srem specific costs:
4286 1. This only considers the cases where the divisor is constant, uniform and
4287 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4288 result in some form of (ldr + adrp), corresponding to constant vectors, or
4289 scalarization of the division operation.
4290 2. Constant divisors, either negative in whole or partially, don't result in
4291 significantly different codegen as compared to positive constant divisors.
4292 So, we don't consider negative divisors separately.
4293 3. If the codegen is significantly different with SVE, it has been indicated
4294 using comments at appropriate places.
4295
4296 sdiv specific cases:
4297 -----------------------------------------------------------------------
4298 codegen | pow-of-2 | Type
4299 -----------------------------------------------------------------------
4300 add + cmp + csel + asr | Y | i64
4301 add + cmp + csel + asr | Y | i32
4302 -----------------------------------------------------------------------
4303
4304 srem specific cases:
4305 -----------------------------------------------------------------------
4306 codegen | pow-of-2 | Type
4307 -----------------------------------------------------------------------
4308 negs + and + and + csneg | Y | i64
4309 negs + and + and + csneg | Y | i32
4310 -----------------------------------------------------------------------
4311
4312 other sdiv/srem cases:
4313 -------------------------------------------------------------------------
4314 common codegen | + srem | + sdiv | pow-of-2 | Type
4315 -------------------------------------------------------------------------
4316 smulh + asr + add + add | - | - | N | i64
4317 smull + lsr + add + add | - | - | N | i32
4318 usra | and + sub | sshr | Y | <2 x i64>
4319 2 * (scalar code) | - | - | N | <2 x i64>
4320 usra | bic + sub | sshr + neg | Y | <4 x i32>
4321 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4322 + sshr + usra | | | |
4323 -------------------------------------------------------------------------
4324 */
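    // Worked example (illustrative): for a scalar i32 sdiv by a uniform
    // power-of-2 constant the table above gives add + cmp + csel + asr, which
    // is modelled below as 3 * AddCost + AsrCost.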
4325 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4326 InstructionCost AddCost =
4327 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4328 Op1Info.getNoProps(), Op2Info.getNoProps());
4329 InstructionCost AsrCost =
4330 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4331 Op1Info.getNoProps(), Op2Info.getNoProps());
4332 InstructionCost MulCost =
4333 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4334 Op1Info.getNoProps(), Op2Info.getNoProps());
4335 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4336 // have similar cost.
4337 auto VT = TLI->getValueType(DL, Ty);
4338 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4339 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4340 // Neg can be folded into the asr instruction.
4341 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4342 : (3 * AsrCost + AddCost);
4343 } else {
4344 return MulCost + AsrCost + 2 * AddCost;
4345 }
4346 } else if (VT.isVector()) {
4347 InstructionCost UsraCost = 2 * AsrCost;
4348 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4349 // Division with scalable types corresponds to native 'asrd'
4350 // instruction when SVE is available.
4351 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4352
4353 // One more for the negation in SDIV
4354        InstructionCost Cost =
4355            (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4356 if (Ty->isScalableTy() && ST->hasSVE())
4357 Cost += 2 * AsrCost;
4358 else {
4359 Cost +=
4360 UsraCost +
4361 (ISD == ISD::SDIV
4362 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4363 : 2 * AddCost);
4364 }
4365 return Cost;
4366 } else if (LT.second == MVT::v2i64) {
4367 return VT.getVectorNumElements() *
4368 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4369 Op1Info.getNoProps(),
4370 Op2Info.getNoProps());
4371 } else {
4372 // When SVE is available, we get:
4373 // smulh + lsr + add/sub + asr + add/sub.
4374 if (Ty->isScalableTy() && ST->hasSVE())
4375 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4376 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4377 }
4378 }
4379 }
4380 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4381 LT.second.isFixedLengthVector()) {
4382 // FIXME: When the constant vector is non-uniform, this may result in
4383 // loading the vector from constant pool or in some cases, may also result
4384 // in scalarization. For now, we are approximating this with the
4385 // scalarization cost.
4386 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4387 CostKind, -1, nullptr, nullptr);
4388 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4389 CostKind, -1, nullptr, nullptr);
4390 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4391 return ExtractCost + InsertCost +
4392 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4393 CostKind, Op1Info.getNoProps(),
4394 Op2Info.getNoProps());
4395 }
4396 [[fallthrough]];
4397 case ISD::UDIV:
4398 case ISD::UREM: {
4399 auto VT = TLI->getValueType(DL, Ty);
4400 if (Op2Info.isConstant()) {
4401 // If the operand is a power of 2 we can use the shift or and cost.
4402 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4403 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4404 Op1Info.getNoProps(),
4405 Op2Info.getNoProps());
4406 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4407 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4408 Op1Info.getNoProps(),
4409 Op2Info.getNoProps());
4410
4411 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4412 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4413 // The MULHU will be expanded to UMULL for the types not listed below,
4414 // and will become a pair of UMULL+MULL2 for 128bit vectors.
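          // For example (illustrative): a 128-bit NEON udiv by a non-trivial
          // constant has HasMULH == false and Is128bit == true, so DivCost
          // below becomes 2 * MulCost + ShrCost + 2 * AddCost + ShrCost.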
4415 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4416 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4417 LT.second == MVT::nxv16i8;
4418 bool Is128bit = LT.second.is128BitVector();
4419
4420 InstructionCost MulCost =
4421 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4422 Op1Info.getNoProps(), Op2Info.getNoProps());
4423 InstructionCost AddCost =
4424 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4425 Op1Info.getNoProps(), Op2Info.getNoProps());
4426 InstructionCost ShrCost =
4427 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4428 Op1Info.getNoProps(), Op2Info.getNoProps());
4429 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4430 (HasMULH ? 0 : ShrCost) + // UMULL shift
4431 AddCost * 2 + ShrCost;
4432 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4433 }
4434 }
4435
4436 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4437 // emitted by the backend even when those functions are not declared in the
4438 // module.
4439 if (!VT.isVector() && VT.getSizeInBits() > 64)
4440 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4441
4442    InstructionCost Cost = BaseT::getArithmeticInstrCost(
4443        Opcode, Ty, CostKind, Op1Info, Op2Info);
4444 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4445 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4446        // If SDIV/UDIV operations are lowered using SVE, then the costs can
4447        // be lower.
4448 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4449 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4450 static const CostTblEntry DivTbl[]{
4451 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4452 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4453 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4454 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4455 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4456 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4457
4458 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4459 if (nullptr != Entry)
4460 return Entry->Cost;
4461 }
4462 // For 8/16-bit elements, the cost is higher because the type
4463 // requires promotion and possibly splitting:
4464 if (LT.second.getScalarType() == MVT::i8)
4465 Cost *= 8;
4466 else if (LT.second.getScalarType() == MVT::i16)
4467 Cost *= 4;
4468 return Cost;
4469 } else {
4470 // If one of the operands is a uniform constant then the cost for each
4471        // element is the cost of insertion, extraction and division.
4472        // Insertion cost = 2, extraction cost = 2, division = cost of the
4473        // operation on the scalar type.
4474 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4475 (Op2Info.isConstant() && Op2Info.isUniform())) {
4476 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4477          InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4478              Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4479 return (4 + DivCost) * VTy->getNumElements();
4480 }
4481 }
4482 // On AArch64, without SVE, vector divisions are expanded
4483 // into scalar divisions of each pair of elements.
4484 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4485 -1, nullptr, nullptr);
4486 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4487 nullptr, nullptr);
4488 }
4489
4490 // TODO: if one of the arguments is scalar, then it's not necessary to
4491 // double the cost of handling the vector elements.
4492 Cost += Cost;
4493 }
4494 return Cost;
4495 }
4496 case ISD::MUL:
4497 // When SVE is available, then we can lower the v2i64 operation using
4498 // the SVE mul instruction, which has a lower cost.
4499 if (LT.second == MVT::v2i64 && ST->hasSVE())
4500 return LT.first;
4501
4502 // When SVE is not available, there is no MUL.2d instruction,
4503 // which means mul <2 x i64> is expensive as elements are extracted
4504 // from the vectors and the muls scalarized.
4505 // As getScalarizationOverhead is a bit too pessimistic, we
4506    // estimate the cost for an i64 vector directly here, which is:
4507 // - four 2-cost i64 extracts,
4508 // - two 2-cost i64 inserts, and
4509 // - two 1-cost muls.
4510    // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4511 // LT.first = 2 the cost is 28.
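    // That is, the expression below evaluates per element to
    // 1 (mul) + 2 x 2 (extracts) + 2 (insert) = 7, giving 2 * 7 = 14 for
    // v2i64 (illustrative arithmetic, assuming the default costs above).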
4512 if (LT.second != MVT::v2i64)
4513 return LT.first;
4514 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4515 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4516 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4517 nullptr, nullptr) *
4518 2 +
4519 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4520 nullptr, nullptr));
4521 case ISD::ADD:
4522 case ISD::XOR:
4523 case ISD::OR:
4524 case ISD::AND:
4525 case ISD::SRL:
4526 case ISD::SRA:
4527 case ISD::SHL:
4528 // These nodes are marked as 'custom' for combining purposes only.
4529 // We know that they are legal. See LowerAdd in ISelLowering.
4530 return LT.first;
4531
4532 case ISD::FNEG:
4533 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4534 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4535 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4536 CxtI &&
4537 ((CxtI->hasOneUse() &&
4538 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4539 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4540 return 0;
4541 [[fallthrough]];
4542 case ISD::FADD:
4543 case ISD::FSUB:
4544 if (!Ty->getScalarType()->isFP128Ty())
4545 return LT.first;
4546 [[fallthrough]];
4547 case ISD::FMUL:
4548 case ISD::FDIV:
4549 // These nodes are marked as 'custom' just to lower them to SVE.
4550 // We know said lowering will incur no additional cost.
4551 if (!Ty->getScalarType()->isFP128Ty())
4552 return 2 * LT.first;
4553
4554 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4555 Op2Info);
4556 case ISD::FREM:
4557 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4558 // those functions are not declared in the module.
4559 if (!Ty->isVectorTy())
4560 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4561 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4562 Op2Info);
4563 }
4564}
4565
4566InstructionCost
4567AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4568 const SCEV *Ptr,
4569 TTI::TargetCostKind CostKind) const {
4570 // Address computations in vectorized code with non-consecutive addresses will
4571 // likely result in more instructions compared to scalar code where the
4572 // computation can more often be merged into the index mode. The resulting
4573 // extra micro-ops can significantly decrease throughput.
4574 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4575 int MaxMergeDistance = 64;
4576
4577 if (PtrTy->isVectorTy() && SE &&
4578 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4579 return NumVectorInstToHideOverhead;
4580
4581 // In many cases the address computation is not merged into the instruction
4582 // addressing mode.
4583 return 1;
4584}
4585
4586/// Check whether Opcode1 has lower throughput than Opcode2 according to the
4587/// scheduling model.
4588bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4589 unsigned Opcode1, unsigned Opcode2) const {
4590 const MCSchedModel &Sched = ST->getSchedModel();
4591 const TargetInstrInfo *TII = ST->getInstrInfo();
4592 if (!Sched.hasInstrSchedModel())
4593 return false;
4594
4595 const MCSchedClassDesc *SCD1 =
4596 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4597 const MCSchedClassDesc *SCD2 =
4598 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4599 // We cannot handle variant scheduling classes without an MI. If we ever
4600 // need to support them for any of the instructions we query here, we will
4601 // either need a way to resolve them without an MI or have to stop using
4602 // the scheduling info.
4603 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4604 "Cannot handle variant scheduling classes without an MI");
4605 if (!SCD1->isValid() || !SCD2->isValid())
4606 return false;
4607
4608 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4609 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4610}
4611
4612InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4613 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4614 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4615 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4616 // We don't lower some vector selects well when they are wider than the
4617 // register width. TODO: Improve this with different cost kinds.
4618 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4619 // We would need this many instructions to hide the scalarization happening.
4620 const int AmortizationCost = 20;
4621
4622 // If VecPred is not set, check if we can get a predicate from the context
4623 // instruction, if its type matches the requested ValTy.
4624 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4625 CmpPredicate CurrentPred;
4626 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4627 m_Value())))
4628 VecPred = CurrentPred;
4629 }
4630 // Check if we have a compare/select chain that can be lowered using
4631 // a (F)CMxx & BFI pair.
4632 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4633 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4634 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4635 VecPred == CmpInst::FCMP_UNE) {
4636 static const auto ValidMinMaxTys = {
4637 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4638 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4639 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4640
4641 auto LT = getTypeLegalizationCost(ValTy);
4642 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4643 (ST->hasFullFP16() &&
4644 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4645 return LT.first;
4646 }
4647
4648 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4649 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4650 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4651 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4652 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4653 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4654 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4655 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4656 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4657 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4658 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4659 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4660
4661 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4662 EVT SelValTy = TLI->getValueType(DL, ValTy);
4663 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4664 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4665 SelCondTy.getSimpleVT(),
4666 SelValTy.getSimpleVT()))
4667 return Entry->Cost;
4668 }
4669 }
4670
4671 if (Opcode == Instruction::FCmp) {
4672 if (auto PromotedCost = getFP16BF16PromoteCost(
4673 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4674 // TODO: Consider costing SVE FCMPs.
4675 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4676 InstructionCost Cost =
4677 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4678 CostKind, Op1Info, Op2Info);
4679 if (isa<VectorType>(PromotedTy))
4681 Instruction::Trunc,
4685 return Cost;
4686 }))
4687 return *PromotedCost;
4688
4689 auto LT = getTypeLegalizationCost(ValTy);
4690 // Model unknown fp compares as a libcall.
4691 if (LT.second.getScalarType() != MVT::f64 &&
4692 LT.second.getScalarType() != MVT::f32 &&
4693 LT.second.getScalarType() != MVT::f16)
4694 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4695 {ValTy, ValTy}, CostKind);
4696
4697 // Some comparison operators require expanding to multiple compares + or.
4698 unsigned Factor = 1;
4699 if (!CondTy->isVectorTy() &&
4700 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4701 Factor = 2; // fcmp with 2 selects
4702 else if (isa<FixedVectorType>(ValTy) &&
4703 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4704 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4705 Factor = 3; // fcmxx+fcmyy+or
4706 else if (isa<ScalableVectorType>(ValTy) &&
4707 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4708 Factor = 3; // fcmxx+fcmyy+or
4709
4710 if (isa<ScalableVectorType>(ValTy) &&
4712 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4713 AArch64::FCMEQv4f32))
4714 Factor *= 2;
4715
4716 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4717 }
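// Illustrative example: an fcmp one on <4 x float> needs fcmgt plus a second
// fcmgt with swapped operands and an orr, so Factor == 3 and, with LT.first
// == 1 and a throughput cost kind, the returned cost is 3.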
4718
4719 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4720 // icmp(and, 0) as free, as we can make use of ands, but only if the
4721 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4722 // providing it will not cause performance regressions.
4723 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4724 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4725 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4726 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4727 if (match(I->getOperand(1), m_Zero()))
4728 return 0;
4729
4730 // x >= 1 / x < 1 -> x > 0 / x <= 0
4731 if (match(I->getOperand(1), m_One()) &&
4732 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4733 return 0;
4734
4735 // x <= -1 / x > -1 -> x > 0 / x <= 0
4736 if (match(I->getOperand(1), m_AllOnes()) &&
4737 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4738 return 0;
4739 }
4740
4741 // The base case handles scalable vectors fine for now, since it treats the
4742 // cost as 1 * legalization cost.
4743 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4744 Op1Info, Op2Info, I);
4745}
4746
4747TTI::MemCmpExpansionOptions
4748AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4749 TTI::MemCmpExpansionOptions Options;
4750 if (ST->requiresStrictAlign()) {
4751 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4752 // a bunch of instructions when strict align is enabled.
4753 return Options;
4754 }
4755 Options.AllowOverlappingLoads = true;
4756 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4757 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4758 // TODO: Though vector loads usually perform well on AArch64, on some
4759 // targets they may wake up the FP unit, which raises the power consumption.
4760 // Perhaps they could be used with no holds barred (-O3).
4761 Options.LoadSizes = {8, 4, 2, 1};
4762 Options.AllowedTailExpansions = {3, 5, 6};
4763 return Options;
4764}
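// Illustrative example (not from the original source): with LoadSizes
// {8, 4, 2, 1} and overlapping loads allowed, a 16-byte equality memcmp can be
// expanded into two 8-byte loads per buffer plus compares, and a 7-byte
// compare can use a 4-byte load plus an overlapping 4-byte load rather than
// falling back to a libcall.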
4765
4766bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4767 return ST->hasSVE();
4768}
4769
4773 switch (MICA.getID()) {
4774 case Intrinsic::masked_scatter:
4775 case Intrinsic::masked_gather:
4776 return getGatherScatterOpCost(MICA, CostKind);
4777 case Intrinsic::masked_load:
4778 case Intrinsic::masked_store:
4779 return getMaskedMemoryOpCost(MICA, CostKind);
4780 }
4782}
4783
4787 Type *Src = MICA.getDataType();
4788
4789 if (useNeonVector(Src))
4791 auto LT = getTypeLegalizationCost(Src);
4792 if (!LT.first.isValid())
4793 return InstructionCost::getInvalid();
4794
4795 // Return an invalid cost for element types that we are unable to lower.
4796 auto *VT = cast<VectorType>(Src);
4797 if (VT->getElementType()->isIntegerTy(1))
4798 return InstructionCost::getInvalid();
4799
4800 // The code-generator is currently not able to handle scalable vectors
4801 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4802 // it. This change will be removed when code-generation for these types is
4803 // sufficiently reliable.
4804 if (VT->getElementCount() == ElementCount::getScalable(1))
4805 return InstructionCost::getInvalid();
4806
4807 return LT.first;
4808}
4809
4810// This function returns the gather/scatter overhead, either from the
4811// user-provided value or from the specialized per-target value in \p ST.
4812static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4813 const AArch64Subtarget *ST) {
4814 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4815 "Should be called on only load or stores.");
4816 switch (Opcode) {
4817 case Instruction::Load:
4818 if (SVEGatherOverhead.getNumOccurrences() > 0)
4819 return SVEGatherOverhead;
4820 return ST->getGatherOverhead();
4821 break;
4822 case Instruction::Store:
4823 if (SVEScatterOverhead.getNumOccurrences() > 0)
4824 return SVEScatterOverhead;
4825 return ST->getScatterOverhead();
4826 break;
4827 default:
4828 llvm_unreachable("Shouldn't have reached here");
4829 }
4830}
4831
4835
4836 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
4837 MICA.getID() == Intrinsic::vp_gather)
4838 ? Instruction::Load
4839 : Instruction::Store;
4840
4841 Type *DataTy = MICA.getDataType();
4842 Align Alignment = MICA.getAlignment();
4843 const Instruction *I = MICA.getInst();
4844
4845 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4846 return InstructionCost::getInvalid();
4847 auto *VT = cast<VectorType>(DataTy);
4848 auto LT = getTypeLegalizationCost(DataTy);
4849 if (!LT.first.isValid())
4850 return InstructionCost::getInvalid();
4851
4852 // Return an invalid cost for element types that we are unable to lower.
4853 if (!LT.second.isVector() ||
4854 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4855 VT->getElementType()->isIntegerTy(1))
4856 return InstructionCost::getInvalid();
4857
4858 // The code-generator is currently not able to handle scalable vectors
4859 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4860 // it. This change will be removed when code-generation for these types is
4861 // sufficiently reliable.
4862 if (VT->getElementCount() == ElementCount::getScalable(1))
4863 return InstructionCost::getInvalid();
4864
4865 ElementCount LegalVF = LT.second.getVectorElementCount();
4866 InstructionCost MemOpCost =
4867 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4868 {TTI::OK_AnyValue, TTI::OP_None}, I);
4869 // Add on an overhead cost for using gathers/scatters.
4870 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4871 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4872}
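// Worked example (illustrative, assuming the default sve-gather-overhead of 10
// and a 128-bit maximum SVE vector length): a <vscale x 4 x i32> gather whose
// scalar load cost is 1 is modeled as 1 (LT.first) * (1 * 10) * 4 (max
// elements) = 40, i.e. every lane pays the scalar memory-op cost scaled by the
// gather overhead.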
4873
4875 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4876}
4877
4879 Align Alignment,
4880 unsigned AddressSpace,
4882 TTI::OperandValueInfo OpInfo,
4883 const Instruction *I) const {
4884 EVT VT = TLI->getValueType(DL, Ty, true);
4885 // Type legalization can't handle structs
4886 if (VT == MVT::Other)
4887 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4888 CostKind);
4889
4890 auto LT = getTypeLegalizationCost(Ty);
4891 if (!LT.first.isValid())
4892 return InstructionCost::getInvalid();
4893
4894 // The code-generator is currently not able to handle scalable vectors
4895 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4896 // it. This change will be removed when code-generation for these types is
4897 // sufficiently reliable.
4898 // We also only support full register predicate loads and stores.
4899 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4900 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4901 (VTy->getElementType()->isIntegerTy(1) &&
4902 !VTy->getElementCount().isKnownMultipleOf(
4903 ElementCount::getScalable(16))))
4904 return InstructionCost::getInvalid();
4905
4906 // TODO: consider latency as well for TCK_SizeAndLatency.
4908 return LT.first;
4909
4911 return 1;
4912
4913 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4914 LT.second.is128BitVector() && Alignment < Align(16)) {
4915 // Unaligned stores are extremely inefficient. We don't split all
4916 // unaligned 128-bit stores because of the negative impact that has been
4917 // shown in practice on inlined block copy code.
4918 // We make such stores expensive so that we will only vectorize if there
4919 // are 6 other instructions getting vectorized.
4920 const int AmortizationCost = 6;
4921
4922 return LT.first * 2 * AmortizationCost;
4923 }
4924
4925 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4926 if (Ty->isPtrOrPtrVectorTy())
4927 return LT.first;
4928
4929 if (useNeonVector(Ty)) {
4930 // Check truncating stores and extending loads.
4931 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4932 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4933 if (VT == MVT::v4i8)
4934 return 2;
4935 // Otherwise we need to scalarize.
4936 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4937 }
4938 EVT EltVT = VT.getVectorElementType();
4939 unsigned EltSize = EltVT.getScalarSizeInBits();
4940 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4941 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4942 return LT.first;
4943 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4944 // widening to v4i8, which produces suboptimal results.
4945 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4946 return LT.first;
4947
4948 // Check non-power-of-2 loads/stores for legal vector element types with
4949 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4950 // operations on smaller power-of-2 ops, including ld1/st1.
4951 LLVMContext &C = Ty->getContext();
4953 SmallVector<EVT> TypeWorklist;
4954 TypeWorklist.push_back(VT);
4955 while (!TypeWorklist.empty()) {
4956 EVT CurrVT = TypeWorklist.pop_back_val();
4957 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4958 if (isPowerOf2_32(CurrNumElements)) {
4959 Cost += 1;
4960 continue;
4961 }
4962
4963 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4964 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4965 TypeWorklist.push_back(
4966 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4967 }
4968 return Cost;
4969 }
4970
4971 return LT.first;
4972}
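// Worked example (illustrative): a <7 x i8> store with align 1 is decomposed
// by the worklist above into <4 x i8>, <2 x i8> and <1 x i8> pieces, giving a
// cost of 3, one unit per power-of-2 piece.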
4973
4975 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4976 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4977 bool UseMaskForCond, bool UseMaskForGaps) const {
4978 assert(Factor >= 2 && "Invalid interleave factor");
4979 auto *VecVTy = cast<VectorType>(VecTy);
4980
4981 if (VecTy->isScalableTy() && !ST->hasSVE())
4982 return InstructionCost::getInvalid();
4983
4984 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4985 // only have lowering for power-of-2 factors.
4986 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4987 // InterleavedAccessPass for ld3/st3
4988 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4989 return InstructionCost::getInvalid();
4990
4991 // Vectorization for masked interleaved accesses is only enabled for scalable
4992 // VF.
4993 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4994 return InstructionCost::getInvalid();
4995
4996 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4997 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4998 auto *SubVecTy =
4999 VectorType::get(VecVTy->getElementType(),
5000 VecVTy->getElementCount().divideCoefficientBy(Factor));
5001
5002 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5003 // Accesses having vector types that are a multiple of 128 bits can be
5004 // matched to more than one ldN/stN instruction.
5005 bool UseScalable;
5006 if (MinElts % Factor == 0 &&
5007 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5008 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5009 }
5010
5011 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5012 Alignment, AddressSpace, CostKind,
5013 UseMaskForCond, UseMaskForGaps);
5014}
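// Illustrative example: an interleaved load group with Factor == 2 over
// <8 x i32> uses SubVecTy == <4 x i32>, a legal 128-bit type, so the returned
// cost is 2 * 1 == 2, i.e. a single ld2 covering the group.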
5015
5016InstructionCost
5017AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
5018 InstructionCost Cost = 0;
5019 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5020 for (auto *I : Tys) {
5021 if (!I->isVectorTy())
5022 continue;
5023 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5024 128)
5025 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5026 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5027 }
5028 return Cost;
5029}
5030
5031unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
5032 return ST->getMaxInterleaveFactor();
5033}
5034
5035// For Falkor, we want to avoid having too many strided loads in a loop since
5036// that can exhaust the HW prefetcher resources. We adjust the unroller
5037// MaxCount preference below to attempt to ensure unrolling doesn't create too
5038// many strided loads.
5039static void
5040getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5041 TargetTransformInfo::UnrollingPreferences &UP) {
5042 enum { MaxStridedLoads = 7 };
5043 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5044 int StridedLoads = 0;
5045 // FIXME? We could make this more precise by looking at the CFG and
5046 // e.g. not counting loads in each side of an if-then-else diamond.
5047 for (const auto BB : L->blocks()) {
5048 for (auto &I : *BB) {
5049 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5050 if (!LMemI)
5051 continue;
5052
5053 Value *PtrValue = LMemI->getPointerOperand();
5054 if (L->isLoopInvariant(PtrValue))
5055 continue;
5056
5057 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5058 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5059 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5060 continue;
5061
5062 // FIXME? We could take pairing of unrolled load copies into account
5063 // by looking at the AddRec, but we would probably have to limit this
5064 // to loops with no stores or other memory optimization barriers.
5065 ++StridedLoads;
5066 // We've seen enough strided loads that seeing more won't make a
5067 // difference.
5068 if (StridedLoads > MaxStridedLoads / 2)
5069 return StridedLoads;
5070 }
5071 }
5072 return StridedLoads;
5073 };
5074
5075 int StridedLoads = countStridedLoads(L, SE);
5076 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5077 << " strided loads\n");
5078 // Pick the largest power of 2 unroll count that won't result in too many
5079 // strided loads.
5080 if (StridedLoads) {
5081 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5082 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5083 << UP.MaxCount << '\n');
5084 }
5085}
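// Worked example (illustrative): with 3 strided loads detected, MaxCount
// becomes 1 << Log2_32(7 / 3) == 2, so unroll factors that would create more
// than 7 strided loads are avoided.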
5086
5087// This function returns true if the loop:
5088// 1. Has a valid cost, and
5089// 2. Has a cost within the supplied budget.
5090// Otherwise it returns false.
5091static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5092 InstructionCost Budget,
5093 unsigned *FinalSize) {
5094 // Estimate the size of the loop.
5095 InstructionCost LoopCost = 0;
5096
5097 for (auto *BB : L->getBlocks()) {
5098 for (auto &I : *BB) {
5099 SmallVector<const Value *, 4> Operands(I.operand_values());
5100 InstructionCost Cost =
5101 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5102 // This can happen with intrinsics that don't currently have a cost model
5103 // or for some operations that require SVE.
5104 if (!Cost.isValid())
5105 return false;
5106
5107 LoopCost += Cost;
5108 if (LoopCost > Budget)
5109 return false;
5110 }
5111 }
5112
5113 if (FinalSize)
5114 *FinalSize = LoopCost.getValue();
5115 return true;
5116}
5117
5118static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5119 const AArch64TTIImpl &TTI) {
5120 // Only consider loops with unknown trip counts for which we can determine
5121 // a symbolic expression. Multi-exit loops with small known trip counts will
5122 // likely be unrolled anyway.
5123 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5124 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5125 return false;
5126
5127 // It might not be worth unrolling loops with low max trip counts. Restrict
5128 // this to max trip counts > 32 for now.
5129 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5130 if (MaxTC > 0 && MaxTC <= 32)
5131 return false;
5132
5133 // Make sure the loop size is <= 5.
5134 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5135 return false;
5136
5137 // Small search loops with multiple exits can be highly beneficial to unroll.
5138 // We only care about loops with exactly two exiting blocks, although each
5139 // block could jump to the same exit block.
5140 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5141 if (Blocks.size() != 2)
5142 return false;
5143
5144 if (any_of(Blocks, [](BasicBlock *BB) {
5145 return !isa<BranchInst>(BB->getTerminator());
5146 }))
5147 return false;
5148
5149 return true;
5150}
5151
5152/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5153/// OOO engine's wide instruction window and various predictors.
5154static void
5155getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5156 TargetTransformInfo::UnrollingPreferences &UP,
5157 const AArch64TTIImpl &TTI) {
5158 // Limit this to loops whose structure is highly likely to benefit from
5159 // runtime unrolling; that is, exclude outer loops and loops with many
5160 // blocks (i.e. likely complex control flow). Note that the heuristics here
5161 // may be overly conservative: we err on the side of avoiding runtime
5162 // unrolling rather than unrolling excessively; all are subject to refinement.
5163 if (!L->isInnermost() || L->getNumBlocks() > 8)
5164 return;
5165
5166 // Loops with multiple exits are handled by common code.
5167 if (!L->getExitBlock())
5168 return;
5169
5170 // Check if the loop contains any reductions that could be parallelized when
5171 // unrolling. If so, enable partial unrolling if the trip count is known to
5172 // be a multiple of 2.
5173 bool HasParellelizableReductions =
5174 L->getNumBlocks() == 1 &&
5175 any_of(L->getHeader()->phis(),
5176 [&SE, L](PHINode &Phi) {
5177 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5178 }) &&
5179 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5180 if (HasParellelizableReductions &&
5181 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5182 UP.Partial = true;
5183 UP.MaxCount = 4;
5184 UP.AddAdditionalAccumulators = true;
5185 }
5186
5187 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5188 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5189 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5190 SE.getSmallConstantMaxTripCount(L) <= 32))
5191 return;
5192
5193 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5194 return;
5195
5197 return;
5198
5199 // Limit to loops with trip counts that are cheap to expand.
5200 UP.SCEVExpansionBudget = 1;
5201
5202 if (HasParellelizableReductions) {
5203 UP.Runtime = true;
5205 UP.AddAdditionalAccumulators = true;
5206 }
5207
5208 // Try to unroll small loops (few blocks, low budget) if they have load/store
5209 // dependencies, to expose more parallel memory access streams, or if they do
5210 // little work inside a block (i.e. a load -> X -> store pattern).
5211 BasicBlock *Header = L->getHeader();
5212 BasicBlock *Latch = L->getLoopLatch();
5213 if (Header == Latch) {
5214 // Estimate the size of the loop.
5215 unsigned Size;
5216 unsigned Width = 10;
5217 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5218 return;
5219
5220 // Try to find an unroll count that maximizes the use of the instruction
5221 // window, i.e. trying to fetch as many instructions per cycle as possible.
5222 unsigned MaxInstsPerLine = 16;
5223 unsigned UC = 1;
5224 unsigned BestUC = 1;
5225 unsigned SizeWithBestUC = BestUC * Size;
5226 while (UC <= 8) {
5227 unsigned SizeWithUC = UC * Size;
5228 if (SizeWithUC > 48)
5229 break;
5230 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5231 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5232 BestUC = UC;
5233 SizeWithBestUC = BestUC * Size;
5234 }
5235 UC++;
5236 }
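// Worked example (illustrative): for a loop body of Size == 12 the search
// above picks BestUC == 4, since 4 * 12 == 48 instructions is a multiple of
// the assumed 16-instruction fetch width and still within the 48-instruction
// limit.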
5237
5238 if (BestUC == 1)
5239 return;
5240
5241 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5242 SmallVector<StoreInst *> Stores;
5243 for (auto *BB : L->blocks()) {
5244 for (auto &I : *BB) {
5245 Value *Ptr = getLoadStorePointerOperand(&I);
5246 if (!Ptr)
5247 continue;
5248 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5249 if (SE.isLoopInvariant(PtrSCEV, L))
5250 continue;
5251 if (isa<LoadInst>(&I)) {
5252 LoadedValuesPlus.insert(&I);
5253 // Include in-loop 1st users of loaded values.
5254 for (auto *U : I.users())
5255 if (L->contains(cast<Instruction>(U)))
5256 LoadedValuesPlus.insert(U);
5257 } else
5258 Stores.push_back(cast<StoreInst>(&I));
5259 }
5260 }
5261
5262 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5263 return LoadedValuesPlus.contains(SI->getOperand(0));
5264 }))
5265 return;
5266
5267 UP.Runtime = true;
5268 UP.DefaultUnrollRuntimeCount = BestUC;
5269 return;
5270 }
5271
5272 // Try to runtime-unroll loops with early-continues depending on loop-varying
5273 // loads; this helps with branch-prediction for the early-continues.
5274 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5276 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5277 !llvm::is_contained(Preds, Header) ||
5278 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5279 return;
5280
5281 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5282 [&](Instruction *I, unsigned Depth) -> bool {
5283 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5284 return false;
5285
5286 if (isa<LoadInst>(I))
5287 return true;
5288
5289 return any_of(I->operands(), [&](Value *V) {
5290 auto *I = dyn_cast<Instruction>(V);
5291 return I && DependsOnLoopLoad(I, Depth + 1);
5292 });
5293 };
5294 CmpPredicate Pred;
5295 Instruction *I;
5296 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5297 m_Value())) &&
5298 DependsOnLoopLoad(I, 0)) {
5299 UP.Runtime = true;
5300 }
5301}
5302
5303void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5304 TTI::UnrollingPreferences &UP,
5305 OptimizationRemarkEmitter *ORE) const {
5306 // Enable partial unrolling and runtime unrolling.
5307 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5308
5309 UP.UpperBound = true;
5310
5311 // An inner loop is more likely to be hot, and its runtime check can be
5312 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5313 // threshold to unroll more loops.
5314 if (L->getLoopDepth() > 1)
5315 UP.PartialThreshold *= 2;
5316
5317 // Disable partial & runtime unrolling on -Os.
5318 UP.PartialOptSizeThreshold = 0;
5319
5320 // Scan the loop: don't unroll loops with calls as this could prevent
5321 // inlining. Don't unroll auto-vectorized loops either, though do allow
5322 // unrolling of the scalar remainder.
5323 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5324 InstructionCost Cost = 0;
5325 for (auto *BB : L->getBlocks()) {
5326 for (auto &I : *BB) {
5327 // Both auto-vectorized loops and the scalar remainder have the
5328 // isvectorized attribute, so differentiate between them by the presence
5329 // of vector instructions.
5330 if (IsVectorized && I.getType()->isVectorTy())
5331 return;
5332 if (isa<CallBase>(I)) {
5335 if (!isLoweredToCall(F))
5336 continue;
5337 return;
5338 }
5339
5340 SmallVector<const Value *, 4> Operands(I.operand_values());
5341 Cost += getInstructionCost(&I, Operands,
5342 TargetTransformInfo::TCK_SizeAndLatency);
5343 }
5344 }
5345
5346 // Apply subtarget-specific unrolling preferences.
5347 if (ST->isAppleMLike())
5348 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5349 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5350 EnableFalkorHWPFUnrollFix)
5351 getFalkorUnrollingPreferences(L, SE, UP);
5352
5353 // If this is a small, multi-exit loop similar to something like std::find,
5354 // then there is typically a performance improvement achieved by unrolling.
5355 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5356 UP.RuntimeUnrollMultiExit = true;
5357 UP.Runtime = true;
5358 // Limit unroll count.
5359 UP.DefaultUnrollRuntimeCount = 4;
5360 // Allow slightly more costly trip-count expansion to catch search loops
5361 // with pointer inductions.
5362 UP.SCEVExpansionBudget = 5;
5363 return;
5364 }
5365
5366 // Enable runtime unrolling for in-order models.
5367 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5368 // by checking for that case we can ensure that the default behaviour is
5369 // unchanged.
5370 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5371 !ST->getSchedModel().isOutOfOrder()) {
5372 UP.Runtime = true;
5373 UP.Partial = true;
5374 UP.UnrollRemainder = true;
5375 UP.DefaultUnrollRuntimeCount = 4;
5376
5377 UP.UnrollAndJam = true;
5378 UP.UnrollAndJamInnerLoopThreshold = 60;
5379 }
5380
5381 // Force-unrolling small loops can be very useful because of the branch-taken
5382 // cost of the backedge.
5384 UP.Force = true;
5385}
5386
5391
5392Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5393 Type *ExpectedType,
5394 bool CanCreate) const {
5395 switch (Inst->getIntrinsicID()) {
5396 default:
5397 return nullptr;
5398 case Intrinsic::aarch64_neon_st2:
5399 case Intrinsic::aarch64_neon_st3:
5400 case Intrinsic::aarch64_neon_st4: {
5401 // Create a struct type
5402 StructType *ST = dyn_cast<StructType>(ExpectedType);
5403 if (!CanCreate || !ST)
5404 return nullptr;
5405 unsigned NumElts = Inst->arg_size() - 1;
5406 if (ST->getNumElements() != NumElts)
5407 return nullptr;
5408 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5409 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5410 return nullptr;
5411 }
5412 Value *Res = PoisonValue::get(ExpectedType);
5413 IRBuilder<> Builder(Inst);
5414 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5415 Value *L = Inst->getArgOperand(i);
5416 Res = Builder.CreateInsertValue(Res, L, i);
5417 }
5418 return Res;
5419 }
5420 case Intrinsic::aarch64_neon_ld2:
5421 case Intrinsic::aarch64_neon_ld3:
5422 case Intrinsic::aarch64_neon_ld4:
5423 if (Inst->getType() == ExpectedType)
5424 return Inst;
5425 return nullptr;
5426 }
5427}
5428
5429bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5430 MemIntrinsicInfo &Info) const {
5431 switch (Inst->getIntrinsicID()) {
5432 default:
5433 break;
5434 case Intrinsic::aarch64_neon_ld2:
5435 case Intrinsic::aarch64_neon_ld3:
5436 case Intrinsic::aarch64_neon_ld4:
5437 Info.ReadMem = true;
5438 Info.WriteMem = false;
5439 Info.PtrVal = Inst->getArgOperand(0);
5440 break;
5441 case Intrinsic::aarch64_neon_st2:
5442 case Intrinsic::aarch64_neon_st3:
5443 case Intrinsic::aarch64_neon_st4:
5444 Info.ReadMem = false;
5445 Info.WriteMem = true;
5446 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5447 break;
5448 }
5449
5450 switch (Inst->getIntrinsicID()) {
5451 default:
5452 return false;
5453 case Intrinsic::aarch64_neon_ld2:
5454 case Intrinsic::aarch64_neon_st2:
5455 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5456 break;
5457 case Intrinsic::aarch64_neon_ld3:
5458 case Intrinsic::aarch64_neon_st3:
5459 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5460 break;
5461 case Intrinsic::aarch64_neon_ld4:
5462 case Intrinsic::aarch64_neon_st4:
5463 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5464 break;
5465 }
5466 return true;
5467}
5468
5469/// See if \p I should be considered for address type promotion. We check if
5470/// \p I is a sext with the right type that is used in memory accesses. If it
5471/// is used in a "complex" getelementptr, we allow it to be promoted without
5472/// finding other sext instructions that sign extended the same initial value.
5473/// A getelementptr is considered "complex" if it has more than 2 operands.
5474bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5475 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5476 bool Considerable = false;
5477 AllowPromotionWithoutCommonHeader = false;
5478 if (!isa<SExtInst>(&I))
5479 return false;
5480 Type *ConsideredSExtType =
5481 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5482 if (I.getType() != ConsideredSExtType)
5483 return false;
5484 // See if the sext is the one with the right type and used in at least one
5485 // GetElementPtrInst.
5486 for (const User *U : I.users()) {
5487 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5488 Considerable = true;
5489 // A getelementptr is considered as "complex" if it has more than 2
5490 // operands. We will promote a SExt used in such a complex GEP as we
5491 // expect some computation to be merged if it is done on 64 bits.
5492 if (GEPInst->getNumOperands() > 2) {
5493 AllowPromotionWithoutCommonHeader = true;
5494 break;
5495 }
5496 }
5497 }
5498 return Considerable;
5499}
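// Illustrative example (hypothetical IR): in
//   %idx = sext i32 %i to i64
//   %p = getelementptr inbounds [64 x i32], ptr %base, i64 %idx, i64 %j
// the GEP has more than two operands, so the sext is considered for promotion
// even without a common header shared with other sext users.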
5500
5501bool AArch64TTIImpl::isLegalToVectorizeReduction(
5502 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5503 if (!VF.isScalable())
5504 return true;
5505
5506 Type *Ty = RdxDesc.getRecurrenceType();
5507 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5508 return false;
5509
5510 switch (RdxDesc.getRecurrenceKind()) {
5511 case RecurKind::Sub:
5513 case RecurKind::Add:
5514 case RecurKind::FAdd:
5515 case RecurKind::And:
5516 case RecurKind::Or:
5517 case RecurKind::Xor:
5518 case RecurKind::SMin:
5519 case RecurKind::SMax:
5520 case RecurKind::UMin:
5521 case RecurKind::UMax:
5522 case RecurKind::FMin:
5523 case RecurKind::FMax:
5524 case RecurKind::FMulAdd:
5525 case RecurKind::AnyOf:
5526 return true;
5527 default:
5528 return false;
5529 }
5530}
5531
5532InstructionCost
5533AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5534 FastMathFlags FMF,
5535 TTI::TargetCostKind CostKind) const {
5536 // The code-generator is currently not able to handle scalable vectors
5537 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5538 // it. This change will be removed when code-generation for these types is
5539 // sufficiently reliable.
5540 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5541 if (VTy->getElementCount() == ElementCount::getScalable(1))
5542 return InstructionCost::getInvalid();
5543
5544 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5545
5546 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5547 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5548
5549 InstructionCost LegalizationCost = 0;
5550 if (LT.first > 1) {
5551 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5552 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5553 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5554 }
5555
5556 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5557}
5558
5559InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5560 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5561 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5562 InstructionCost LegalizationCost = 0;
5563 if (LT.first > 1) {
5564 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5565 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5566 LegalizationCost *= LT.first - 1;
5567 }
5568
5569 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5570 assert(ISD && "Invalid opcode");
5571 // Add the final reduction cost for the legal horizontal reduction
5572 switch (ISD) {
5573 case ISD::ADD:
5574 case ISD::AND:
5575 case ISD::OR:
5576 case ISD::XOR:
5577 case ISD::FADD:
5578 return LegalizationCost + 2;
5579 default:
5580 return InstructionCost::getInvalid();
5581 }
5582}
5583
5584InstructionCost
5585AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5586 std::optional<FastMathFlags> FMF,
5587 TTI::TargetCostKind CostKind) const {
5588 // The code-generator is currently not able to handle scalable vectors
5589 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5590 // it. This change will be removed when code-generation for these types is
5591 // sufficiently reliable.
5592 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5593 if (VTy->getElementCount() == ElementCount::getScalable(1))
5595
5597 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5598 InstructionCost BaseCost =
5599 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5600 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5601 // end up vectorizing for more computationally intensive loops.
5602 return BaseCost + FixedVTy->getNumElements();
5603 }
5604
5605 if (Opcode != Instruction::FAdd)
5606 return InstructionCost::getInvalid();
5607
5608 auto *VTy = cast<ScalableVectorType>(ValTy);
5609 InstructionCost Cost =
5610 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5611 Cost *= getMaxNumElements(VTy->getElementCount());
5612 return Cost;
5613 }
5614
5615 if (isa<ScalableVectorType>(ValTy))
5616 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5617
5618 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5619 MVT MTy = LT.second;
5620 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5621 assert(ISD && "Invalid opcode");
5622
5623 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5624 // instructions as twice a normal vector add, plus 1 for each legalization
5625 // step (LT.first). This is the only arithmetic vector reduction operation for
5626 // which we have an instruction.
5627 // OR, XOR and AND costs should match the codegen from:
5628 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5629 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5630 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5631 static const CostTblEntry CostTblNoPairwise[]{
5632 {ISD::ADD, MVT::v8i8, 2},
5633 {ISD::ADD, MVT::v16i8, 2},
5634 {ISD::ADD, MVT::v4i16, 2},
5635 {ISD::ADD, MVT::v8i16, 2},
5636 {ISD::ADD, MVT::v2i32, 2},
5637 {ISD::ADD, MVT::v4i32, 2},
5638 {ISD::ADD, MVT::v2i64, 2},
5639 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5640 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5641 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5642 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5643 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5644 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5645 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5646 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5647 {ISD::XOR, MVT::v16i8, 7},
5648 {ISD::XOR, MVT::v4i16, 4},
5649 {ISD::XOR, MVT::v8i16, 6},
5650 {ISD::XOR, MVT::v2i32, 3},
5651 {ISD::XOR, MVT::v4i32, 5},
5652 {ISD::XOR, MVT::v2i64, 3},
5653 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5654 {ISD::AND, MVT::v16i8, 7},
5655 {ISD::AND, MVT::v4i16, 4},
5656 {ISD::AND, MVT::v8i16, 6},
5657 {ISD::AND, MVT::v2i32, 3},
5658 {ISD::AND, MVT::v4i32, 5},
5659 {ISD::AND, MVT::v2i64, 3},
5660 };
5661 switch (ISD) {
5662 default:
5663 break;
5664 case ISD::FADD:
5665 if (Type *EltTy = ValTy->getScalarType();
5666 // FIXME: For half types without fullfp16 support, this could extend and
5667 // use a fp32 faddp reduction but current codegen unrolls.
5668 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5669 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5670 const unsigned NElts = MTy.getVectorNumElements();
5671 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5672 isPowerOf2_32(NElts))
5673 // A reduction corresponding to a series of fadd instructions is lowered to
5674 // a series of faddp instructions. faddp has latency/throughput matching
5675 // the fadd instruction, and hence every faddp instruction can be
5676 // considered to have a relative cost = 1 with
5677 // CostKind = TCK_RecipThroughput.
5678 // An faddp pairwise-adds vector elements, so the size of the input
5679 // vector is halved every time, requiring
5680 // #(faddp instructions) = Log2_32(NElts).
5681 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5682 }
5683 break;
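// Worked example (illustrative): a fast-math fadd reduction of <4 x float>
// has LT.first == 1, so the cost is (1 - 1) + Log2_32(4) == 2, i.e. two
// faddp instructions.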
5684 case ISD::ADD:
5685 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5686 return (LT.first - 1) + Entry->Cost;
5687 break;
5688 case ISD::XOR:
5689 case ISD::AND:
5690 case ISD::OR:
5691 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5692 if (!Entry)
5693 break;
5694 auto *ValVTy = cast<FixedVectorType>(ValTy);
5695 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5696 isPowerOf2_32(ValVTy->getNumElements())) {
5697 InstructionCost ExtraCost = 0;
5698 if (LT.first != 1) {
5699 // Type needs to be split, so there is an extra cost of LT.first - 1
5700 // arithmetic ops.
5701 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5702 MTy.getVectorNumElements());
5703 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5704 ExtraCost *= LT.first - 1;
5705 }
5706 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5707 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5708 return Cost + ExtraCost;
5709 }
5710 break;
5711 }
5712 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5713}
5714
5716 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5717 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5718 EVT VecVT = TLI->getValueType(DL, VecTy);
5719 EVT ResVT = TLI->getValueType(DL, ResTy);
5720
5721 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5722 VecVT.getSizeInBits() >= 64) {
5723 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5724
5725 // The legal cases are:
5726 // UADDLV 8/16/32->32
5727 // UADDLP 32->64
5728 unsigned RevVTSize = ResVT.getSizeInBits();
5729 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5730 RevVTSize <= 32) ||
5731 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5732 RevVTSize <= 32) ||
5733 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5734 RevVTSize <= 64))
5735 return (LT.first - 1) * 2 + 2;
5736 }
5737
5738 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5739 CostKind);
5740}
5741
5742InstructionCost
5743AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5744 Type *ResTy, VectorType *VecTy,
5745 TTI::TargetCostKind CostKind) const {
5746 EVT VecVT = TLI->getValueType(DL, VecTy);
5747 EVT ResVT = TLI->getValueType(DL, ResTy);
5748
5749 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5750 RedOpcode == Instruction::Add) {
5751 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5752
5753 // The legal cases with dotprod are
5754 // UDOT 8->32
5755 // Which requires an additional uaddv to sum the i32 values.
5756 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5757 ResVT == MVT::i32)
5758 return LT.first + 2;
5759 }
5760
5761 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5762 CostKind);
5763}
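// Illustrative example (assuming +dotprod): a multiply-accumulate reduction of
// <16 x i8> inputs into an i32 sum maps onto udot/sdot, so the returned cost
// is LT.first + 2 == 3 rather than the generic extend + multiply + reduce
// expansion.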
5764
5765InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5766 TTI::TargetCostKind CostKind) const {
5767
5768 static const CostTblEntry ShuffleTbl[] = {
5769 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5770 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5771 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5772 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5773 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5774 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5775 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5776 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5777 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5778 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5779 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5780 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5781 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5782 };
5783
5784 // The code-generator is currently not able to handle scalable vectors
5785 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5786 // it. This change will be removed when code-generation for these types is
5787 // sufficiently reliable.
5788 if (Tp->getElementCount() == ElementCount::getScalable(1))
5789 return InstructionCost::getInvalid();
5790
5791 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5792 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5793 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5794 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5795 : LT.second;
5796 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5797 InstructionCost LegalizationCost = 0;
5798 if (Index < 0) {
5799 LegalizationCost =
5800 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5801 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5802 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5803 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5804 }
5805
5806 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
5807 // The cost is computed on the promoted type.
5808 if (LT.second.getScalarType() == MVT::i1) {
5809 LegalizationCost +=
5810 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5811 TTI::CastContextHint::None, CostKind) +
5812 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5813 TTI::CastContextHint::None, CostKind);
5814 }
5815 const auto *Entry =
5816 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5817 assert(Entry && "Illegal Type for Splice");
5818 LegalizationCost += Entry->Cost;
5819 return LegalizationCost * LT.first;
5820}
5821
5823 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5825 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5828
5830 return Invalid;
5831
5832 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5833 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5834 return Invalid;
5835
5836 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5837 OpAExtend == TTI::PR_None)
5838 return Invalid;
5839
5840 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5841 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5842 "Unexpected values for OpBExtend or InputTypeB");
5843
5844 // We only support multiply binary operations for now, and for muls we
5845 // require the types being extended to be the same.
5846 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5847 return Invalid;
5848
5849 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5850 if (IsUSDot && !ST->hasMatMulInt8())
5851 return Invalid;
5852
5853 unsigned Ratio =
5854 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5855 if (VF.getKnownMinValue() <= Ratio)
5856 return Invalid;
5857
5858 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5859 VectorType *AccumVectorType =
5860 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5861 // We don't yet support all kinds of legalization.
5862 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5863 EVT::getEVT(AccumVectorType));
5864 switch (TC.first) {
5865 default:
5866 return Invalid;
5870 // The legalised type (e.g. after splitting) must be legal too.
5871 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5873 return Invalid;
5874 break;
5875 }
5876
5877 std::pair<InstructionCost, MVT> AccumLT =
5878 getTypeLegalizationCost(AccumVectorType);
5879 std::pair<InstructionCost, MVT> InputLT =
5880 getTypeLegalizationCost(InputVectorType);
5881
5882 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5883
5884 // Prefer using full types by costing half-full input types as more expensive.
5885 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5887 // FIXME: This can be removed after the cost of the extends are folded into
5888 // the dot-product expression in VPlan, after landing:
5889 // https://github.com/llvm/llvm-project/pull/147302
5890 Cost *= 2;
5891
5892 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5893 // i16 -> i64 is natively supported for udot/sdot
5894 if (AccumLT.second.getScalarType() == MVT::i64 &&
5895 InputLT.second.getScalarType() == MVT::i16)
5896 return Cost;
5897 // i8 -> i64 is supported with an extra level of extends
5898 if (AccumLT.second.getScalarType() == MVT::i64 &&
5899 InputLT.second.getScalarType() == MVT::i8)
5900 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5901 // because it requires two extra extends on the inputs. But if we'd change
5902 // that now, a regular reduction would be cheaper because the costs of
5903 // the extends in the IR are still counted. This can be fixed
5904 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5905 return Cost;
5906 }
5907
5908 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5909 if (ST->isSVEorStreamingSVEAvailable() ||
5910 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5911 ST->hasDotProd())) {
5912 if (AccumLT.second.getScalarType() == MVT::i32 &&
5913 InputLT.second.getScalarType() == MVT::i8)
5914 return Cost;
5915 }
5916
5917 // Add additional cost for the extends that would need to be inserted.
5918 return Cost + 2;
5919}
5920
5921InstructionCost AArch64TTIImpl::getShuffleCost(
5922 TTI::ShuffleKind Kind, VectorType *DstTy,
5923 VectorType *SrcTy, ArrayRef<int> Mask,
5924 TTI::TargetCostKind CostKind, int Index,
5925 VectorType *SubTp, ArrayRef<const Value *> Args,
5926 const Instruction *CxtI) const {
5927 assert((Mask.empty() || DstTy->isScalableTy() ||
5928 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5929 "Expected the Mask to match the return size if given");
5930 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5931 "Expected the same scalar types");
5932 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5933
5934 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5935 // into smaller vectors and sum the cost of each shuffle.
5936 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5937 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5938 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5939 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5940 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5941 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5942 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5943 // cost than just the load.
5944 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5945 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
5946 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
5947 return std::max<InstructionCost>(1, LT.first / 4);
5948
5949 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5950 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5951 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5952 // cost than just the store.
5953 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5954 (ShuffleVectorInst::isInterleaveMask(
5955 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5956 ShuffleVectorInst::isInterleaveMask(
5957 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5958 return LT.first;
5959
5960 unsigned TpNumElts = Mask.size();
5961 unsigned LTNumElts = LT.second.getVectorNumElements();
5962 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5963 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5964 LT.second.getVectorElementCount());
5965 InstructionCost Cost;
5966 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5967 PreviousCosts;
5968 for (unsigned N = 0; N < NumVecs; N++) {
5969 SmallVector<int> NMask;
5970 // Split the existing mask into chunks of size LTNumElts. Track the source
5971 // sub-vectors to ensure the result has at most 2 inputs.
5972 unsigned Source1 = -1U, Source2 = -1U;
5973 unsigned NumSources = 0;
5974 for (unsigned E = 0; E < LTNumElts; E++) {
5975 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5977 if (MaskElt < 0) {
5978 NMask.push_back(PoisonMaskElem);
5979 continue;
5980 }
5981
5982 // Calculate which source from the input this comes from and whether it
5983 // is new to us.
5984 unsigned Source = MaskElt / LTNumElts;
5985 if (NumSources == 0) {
5986 Source1 = Source;
5987 NumSources = 1;
5988 } else if (NumSources == 1 && Source != Source1) {
5989 Source2 = Source;
5990 NumSources = 2;
5991 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5992 NumSources++;
5993 }
5994
5995 // Add to the new mask. For the NumSources>2 case these are not correct,
5996 // but are only used for the modular lane number.
5997 if (Source == Source1)
5998 NMask.push_back(MaskElt % LTNumElts);
5999 else if (Source == Source2)
6000 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6001 else
6002 NMask.push_back(MaskElt % LTNumElts);
6003 }
6004 // Check if we have already generated this sub-shuffle, which means we
6005 // will have already generated the output. For example a <16 x i32> splat
6006 // will be the same sub-splat 4 times, which only needs to be generated
6007 // once and reused.
6008 auto Result =
6009 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6010 // Check if it was already in the map (already costed).
6011 if (!Result.second)
6012 continue;
6013 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6014 // getShuffleCost. If not then cost it using the worst case as the number
6015 // of element moves into a new vector.
6016 InstructionCost NCost =
6017 NumSources <= 2
6018 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6019 : TTI::SK_PermuteTwoSrc,
6020 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6021 CxtI)
6022 : LTNumElts;
6023 Result.first->second = NCost;
6024 Cost += NCost;
6025 }
6026 return Cost;
6027 }
6028
6029 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6030 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6031 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6032 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6033 // This currently only handles low or high extracts to prevent SLP vectorizer
6034 // regressions.
6035 // Note that SVE's ext instruction is destructive, but it can be fused with
6036 // a movprfx to act like a constructive instruction.
6037 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6038 if (LT.second.getFixedSizeInBits() >= 128 &&
6039 cast<FixedVectorType>(SubTp)->getNumElements() ==
6040 LT.second.getVectorNumElements() / 2) {
6041 if (Index == 0)
6042 return 0;
6043 if (Index == (int)LT.second.getVectorNumElements() / 2)
6044 return 1;
6045 }
6046 Kind = TTI::SK_PermuteSingleSrc;
6047 }
6048 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6049 // the code to handle length-changing shuffles.
6050 if (Kind == TTI::SK_InsertSubvector) {
6051 LT = getTypeLegalizationCost(DstTy);
6052 SrcTy = DstTy;
6053 }
6054
6055 // Check for identity masks, which we can treat as free for both fixed and
6056 // scalable vector paths.
6057 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6058 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6059 all_of(enumerate(Mask), [](const auto &M) {
6060 return M.value() < 0 || M.value() == (int)M.index();
6061 }))
6062 return 0;
6063
6064 // Segmented shuffle matching.
6065 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6066 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6067 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6069
6071 unsigned Segments =
6073 unsigned SegmentElts = VTy->getNumElements() / Segments;
6074
6075 // dupq zd.t, zn.t[idx]
6076 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6077 ST->isSVEorStreamingSVEAvailable() &&
6078 isDUPQMask(Mask, Segments, SegmentElts))
6079 return LT.first;
6080
6081 // mov zd.q, vn
6082 if (ST->isSVEorStreamingSVEAvailable() &&
6083 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6084 return LT.first;
6085 }
6086
6087 // Check for broadcast loads, which are supported by the LD1R instruction.
6088 // In terms of code-size, the shuffle vector is free when a load + dup get
6089 // folded into a LD1R. That's what we check and return here. For performance
6090 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6091 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6092 // that we model the load + dup sequence slightly higher because LD1R is a
6093 // high latency instruction.
6094 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6095 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6096 if (IsLoad && LT.second.isVector() &&
6097 isLegalBroadcastLoad(SrcTy->getElementType(),
6098 LT.second.getVectorElementCount()))
6099 return 0;
6100 }
6101
6102 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6103 // from the perfect shuffle tables.
6104 if (Mask.size() == 4 &&
6105 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6106 (SrcTy->getScalarSizeInBits() == 16 ||
6107 SrcTy->getScalarSizeInBits() == 32) &&
6108 all_of(Mask, [](int E) { return E < 8; }))
6109 return getPerfectShuffleCost(Mask);
6110
6111 // Check for other shuffles that are not SK_ kinds but we have native
6112 // instructions for, for example ZIP and UZP.
6113 unsigned Unused;
6114 if (LT.second.isFixedLengthVector() &&
6115 LT.second.getVectorNumElements() == Mask.size() &&
6116 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6117 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6118 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6119 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6120 LT.second.getVectorNumElements(), 16) ||
6121 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6122 LT.second.getVectorNumElements(), 32) ||
6123 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6124 LT.second.getVectorNumElements(), 64) ||
6125 // Check for non-zero lane splats
6126 all_of(drop_begin(Mask),
6127 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6128 return 1;
6129
6130 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6131 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6132 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6133 static const CostTblEntry ShuffleTbl[] = {
6134 // Broadcast shuffle kinds can be performed with 'dup'.
6135 {TTI::SK_Broadcast, MVT::v8i8, 1},
6136 {TTI::SK_Broadcast, MVT::v16i8, 1},
6137 {TTI::SK_Broadcast, MVT::v4i16, 1},
6138 {TTI::SK_Broadcast, MVT::v8i16, 1},
6139 {TTI::SK_Broadcast, MVT::v2i32, 1},
6140 {TTI::SK_Broadcast, MVT::v4i32, 1},
6141 {TTI::SK_Broadcast, MVT::v2i64, 1},
6142 {TTI::SK_Broadcast, MVT::v4f16, 1},
6143 {TTI::SK_Broadcast, MVT::v8f16, 1},
6144 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6145 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6146 {TTI::SK_Broadcast, MVT::v2f32, 1},
6147 {TTI::SK_Broadcast, MVT::v4f32, 1},
6148 {TTI::SK_Broadcast, MVT::v2f64, 1},
6149 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6150 // 'zip1/zip2' instructions.
6151 {TTI::SK_Transpose, MVT::v8i8, 1},
6152 {TTI::SK_Transpose, MVT::v16i8, 1},
6153 {TTI::SK_Transpose, MVT::v4i16, 1},
6154 {TTI::SK_Transpose, MVT::v8i16, 1},
6155 {TTI::SK_Transpose, MVT::v2i32, 1},
6156 {TTI::SK_Transpose, MVT::v4i32, 1},
6157 {TTI::SK_Transpose, MVT::v2i64, 1},
6158 {TTI::SK_Transpose, MVT::v4f16, 1},
6159 {TTI::SK_Transpose, MVT::v8f16, 1},
6160 {TTI::SK_Transpose, MVT::v4bf16, 1},
6161 {TTI::SK_Transpose, MVT::v8bf16, 1},
6162 {TTI::SK_Transpose, MVT::v2f32, 1},
6163 {TTI::SK_Transpose, MVT::v4f32, 1},
6164 {TTI::SK_Transpose, MVT::v2f64, 1},
6165 // Select shuffle kinds.
6166 // TODO: handle vXi8/vXi16.
6167 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6168 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6169 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6170 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6171 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6172 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6173 // PermuteSingleSrc shuffle kinds.
6174 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6175 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6176 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6177 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6178 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6179 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6180 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6181 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6182 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6183 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6184 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6185 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6186 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6187 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6188 // Reverse can be lowered with `rev`.
6189 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6190 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6191 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6192 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6193 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6194 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6195 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6196 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6197 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6198 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6199 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6200 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6201 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6202 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6203 // Splice can all be lowered as `ext`.
6204 {TTI::SK_Splice, MVT::v2i32, 1},
6205 {TTI::SK_Splice, MVT::v4i32, 1},
6206 {TTI::SK_Splice, MVT::v2i64, 1},
6207 {TTI::SK_Splice, MVT::v2f32, 1},
6208 {TTI::SK_Splice, MVT::v4f32, 1},
6209 {TTI::SK_Splice, MVT::v2f64, 1},
6210 {TTI::SK_Splice, MVT::v8f16, 1},
6211 {TTI::SK_Splice, MVT::v8bf16, 1},
6212 {TTI::SK_Splice, MVT::v8i16, 1},
6213 {TTI::SK_Splice, MVT::v16i8, 1},
6214 {TTI::SK_Splice, MVT::v4f16, 1},
6215 {TTI::SK_Splice, MVT::v4bf16, 1},
6216 {TTI::SK_Splice, MVT::v4i16, 1},
6217 {TTI::SK_Splice, MVT::v8i8, 1},
6218 // Broadcast shuffle kinds for scalable vectors
6219 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6220 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6221 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6222 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6223 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6224 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6225 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6226 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6227 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6228 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6229 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6230 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6231 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6232 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6233 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6234 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6235 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6236 // Handle the cases for vector.reverse with scalable vectors
6237 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6238 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6239 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6240 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6241 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6242 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6243 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6244 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6245 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6246 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6247 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6248 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6249 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6250 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6251 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6252 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6253 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6254 };
6255 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6256 return LT.first * Entry->Cost;
6257 }
6258
6259 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6260 return getSpliceCost(SrcTy, Index, CostKind);
6261
6262 // Inserting a subvector can often be done with either a D, S or H register
6263 // move, so long as the inserted vector is "aligned".
6264 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6265 LT.second.getSizeInBits() <= 128 && SubTp) {
6266 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6267 if (SubLT.second.isVector()) {
6268 int NumElts = LT.second.getVectorNumElements();
6269 int NumSubElts = SubLT.second.getVectorNumElements();
6270 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6271 return SubLT.first;
6272 }
6273 }
6274
6275 // Restore optimal kind.
6276 if (IsExtractSubvector)
6277 Kind = TTI::SK_ExtractSubvector;
6278 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6279 Args, CxtI);
6280}
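The two cheap-mask tests used above (the identity-mask early exit and the trailing non-zero-lane-splat check) are easy to demonstrate outside of LLVM. The following standalone sketch, which is illustrative only and not part of the source file, re-implements them over a plain std::vector<int>, with -1 standing for an undefined lane as in LLVM shuffle masks.

#include <cstddef>
#include <iostream>
#include <vector>

// Identity mask: every defined lane selects its own index (costed as 0 above).
static bool isIdentityMask(const std::vector<int> &Mask) {
  for (std::size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] >= 0 && Mask[I] != static_cast<int>(I))
      return false;
  return true;
}

// Lane splat: every defined lane repeats lane Mask[0] (a single DUP, cost 1).
static bool isLaneSplat(const std::vector<int> &Mask) {
  for (std::size_t I = 1; I < Mask.size(); ++I)
    if (Mask[I] >= 0 && Mask[I] != Mask[0])
      return false;
  return true;
}

int main() {
  std::cout << isIdentityMask({0, 1, -1, 3}) << ' ' // 1
            << isLaneSplat({2, 2, -1, 2}) << '\n';  // 1
}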
6281
6282 static bool containsDecreasingPointers(Loop *TheLoop,
6283 PredicatedScalarEvolution *PSE,
6284 const DominatorTree &DT) {
6285 const auto &Strides = DenseMap<Value *, const SCEV *>();
6286 for (BasicBlock *BB : TheLoop->blocks()) {
6287 // Scan the instructions in the block and look for addresses that are
6288 // consecutive and decreasing.
6289 for (Instruction &I : *BB) {
6290 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6291 Value *Ptr = getLoadStorePointerOperand(&I);
6292 Type *AccessTy = getLoadStoreType(&I);
6293 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6294 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6295 .value_or(0) < 0)
6296 return true;
6297 }
6298 }
6299 }
6300 return false;
6301}
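For illustration only, this is the kind of loop the helper above reports: the store address moves backwards through memory every iteration, so tail-folding it would additionally need reversed predicates. The function name is invented for the example.

void reverse_fill(int *A, int N) {
  // The address &A[I] decreases by sizeof(int) each iteration: a negative
  // stride, which getPtrStride() reports as a value below zero.
  for (int I = N - 1; I >= 0; --I)
    A[I] = 0;
}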
6302
6303 bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6304 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6305 return SVEPreferFixedOverScalableIfEqualCost;
6306 // For cases like post-LTO vectorization, when we eventually know the trip
6307 // count, epilogue with fixed-width vectorization can be deleted if the trip
6308 // count is less than the epilogue iterations. That's why we prefer
6309 // fixed-width vectorization in epilogue in case of equal costs.
6310 if (IsEpilogue)
6311 return true;
6312 return ST->useFixedOverScalableIfEqualCost();
6313}
6314
6315 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6316 return ST->getEpilogueVectorizationMinVF();
6317}
6318
6319 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6320 if (!ST->hasSVE())
6321 return false;
6322
6323 // We don't currently support vectorisation with interleaving for SVE - with
6324 // such loops we're better off not using tail-folding. This gives us a chance
6325 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6326 if (TFI->IAI->hasGroups())
6327 return false;
6328
6329 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6330 if (TFI->LVL->getReductionVars().size())
6331 Required |= TailFoldingOpts::Reductions;
6332 if (TFI->LVL->getFixedOrderRecurrences().size())
6333 Required |= TailFoldingOpts::Recurrences;
6334
6335 // We call this to discover whether any load/store pointers in the loop have
6336 // negative strides. This will require extra work to reverse the loop
6337 // predicate, which may be expensive.
6338 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6339 TFI->LVL->getPredicatedScalarEvolution(),
6340 *TFI->LVL->getDominatorTree()))
6341 Required |= TailFoldingOpts::Reverse;
6342 if (Required == TailFoldingOpts::Disabled)
6343 Required |= TailFoldingOpts::Simple;
6344
6345 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6346 Required))
6347 return false;
6348
6349 // Don't tail-fold for tight loops where we would be better off interleaving
6350 // with an unpredicated loop.
6351 unsigned NumInsns = 0;
6352 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6353 NumInsns += BB->sizeWithoutDebug();
6354 }
6355
6356 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6357 return NumInsns >= SVETailFoldInsnThreshold;
6358}
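A minimal sketch of the final size check above, assuming the per-block instruction counts have already been collected; the names are invented. Loops smaller than SVETailFoldInsnThreshold (15 by default, roughly four instructions of which are induction-variable overhead) report false, so the vectorizer can interleave an unpredicated body instead.

#include <numeric>
#include <vector>

static bool bigEnoughToTailFold(const std::vector<unsigned> &BlockSizes,
                                unsigned Threshold = 15) {
  // Sum the non-debug instruction counts of all loop blocks, then compare
  // against the tail-folding threshold, mirroring the loop above.
  unsigned NumInsns =
      std::accumulate(BlockSizes.begin(), BlockSizes.end(), 0u);
  return NumInsns >= Threshold;
}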
6359
6360 InstructionCost
6361 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6362 StackOffset BaseOffset, bool HasBaseReg,
6363 int64_t Scale, unsigned AddrSpace) const {
6364 // Scaling factors are not free at all.
6365 // Operands | Rt Latency
6366 // -------------------------------------------
6367 // Rt, [Xn, Xm] | 4
6368 // -------------------------------------------
6369 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6370 // Rt, [Xn, Wm, <extend> #imm] |
6371 TargetLoweringBase::AddrMode AM;
6372 AM.BaseGV = BaseGV;
6373 AM.BaseOffs = BaseOffset.getFixed();
6374 AM.HasBaseReg = HasBaseReg;
6375 AM.Scale = Scale;
6376 AM.ScalableOffset = BaseOffset.getScalable();
6377 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6378 // Scale represents reg2 * scale, thus account for 1 if
6379 // it is not equal to 0 or 1.
6380 return AM.Scale != 0 && AM.Scale != 1;
6381 return InstructionCost::getInvalid();
6382}
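For a legal addressing mode the returned value reduces to a simple predicate on the scale, shown below as a hedged standalone sketch: a scale of 0 (no index register) or 1 (plain register index) is free, while any other scale pays for the scaled-register form, matching the latency table in the comment above.

static int scalingFactorCostSketch(long long Scale) {
  // One extra unit only when a genuine scaled index (lsl #imm / extend #imm)
  // is needed; zero for "no index" or an unscaled register index.
  return (Scale != 0 && Scale != 1) ? 1 : 0;
}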
6383
6384 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6385 const Instruction *I) const {
6386 if (EnableOrLikeSelectOpt) {
6387 // For the binary operators (e.g. or) we need to be more careful than
6388 // selects, here we only transform them if they are already at a natural
6389 // break point in the code - the end of a block with an unconditional
6390 // terminator.
6391 if (I->getOpcode() == Instruction::Or &&
6392 isa<BranchInst>(I->getNextNode()) &&
6393 cast<BranchInst>(I->getNextNode())->isUnconditional())
6394 return true;
6395
6396 if (I->getOpcode() == Instruction::Add ||
6397 I->getOpcode() == Instruction::Sub)
6398 return true;
6399 }
6400 return BaseT::shouldTreatInstructionLikeSelect(I);
6401}
6402
6403 bool AArch64TTIImpl::isLSRCostLess(
6404 const TargetTransformInfo::LSRCost &C1,
6405 const TargetTransformInfo::LSRCost &C2) const {
6406 // AArch64 specific here is adding the number of instructions to the
6407 // comparison (though not as the first consideration, as some targets do)
6408 // along with changing the priority of the base additions.
6409 // TODO: Maybe a more nuanced tradeoff between instruction count
6410 // and number of registers? To be investigated at a later date.
6411 if (EnableLSRCostOpt)
6412 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6413 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6414 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6415 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6416
6417 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6418}
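Because std::tie builds tuples that compare lexicographically, the ordering above ranks register pressure first, then instruction count, then base additions, and so on. The compilable sketch below shows the same comparison on an invented plain struct with made-up field values.

#include <iostream>
#include <tuple>

struct LSRCostSketch {
  unsigned NumRegs, Insns, NumBaseAdds, AddRecCost, NumIVMuls, ScaleCost,
      ImmCost, SetupCost;
};

static bool lessThan(const LSRCostSketch &C1, const LSRCostSketch &C2) {
  return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
                  C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
                  C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  LSRCostSketch A{3, 10, 1, 0, 0, 0, 0, 0}, B{4, 5, 0, 0, 0, 0, 0, 0};
  std::cout << lessThan(A, B) << '\n'; // 1: fewer registers wins despite more insns
}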
6419
6420static bool isSplatShuffle(Value *V) {
6421 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6422 return all_equal(Shuf->getShuffleMask());
6423 return false;
6424}
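llvm::all_equal simply checks that every mask element matches the first one; a plain-STL equivalent is shown below for reference (an empty mask counts as a splat in both versions).

#include <algorithm>
#include <functional>
#include <vector>

static bool isSplatMask(const std::vector<int> &Mask) {
  // True when no two adjacent elements differ, i.e. all elements are equal.
  return std::adjacent_find(Mask.begin(), Mask.end(),
                            std::not_equal_to<int>()) == Mask.end();
}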
6425
6426/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6427/// or upper half of the vector elements.
6428static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6429 bool AllowSplat = false) {
6430 // Scalable types can't be extract shuffle vectors.
6431 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6432 return false;
6433
6434 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6435 auto *FullTy = FullV->getType();
6436 auto *HalfTy = HalfV->getType();
6437 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6438 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6439 };
6440
6441 auto extractHalf = [](Value *FullV, Value *HalfV) {
6442 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6443 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6444 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6445 };
6446
6447 ArrayRef<int> M1, M2;
6448 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6449 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6450 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6451 return false;
6452
6453 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6454 // it is not checked as an extract below.
6455 if (AllowSplat && isSplatShuffle(Op1))
6456 S1Op1 = nullptr;
6457 if (AllowSplat && isSplatShuffle(Op2))
6458 S2Op1 = nullptr;
6459
6460 // Check that the operands are half as wide as the result and we extract
6461 // half of the elements of the input vectors.
6462 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6463 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6464 return false;
6465
6466 // Check the mask extracts either the lower or upper half of vector
6467 // elements.
6468 int M1Start = 0;
6469 int M2Start = 0;
6470 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6471 if ((S1Op1 &&
6472 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6473 (S2Op1 &&
6474 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6475 return false;
6476
6477 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6478 (M2Start != 0 && M2Start != (NumElements / 2)))
6479 return false;
6480 if (S1Op1 && S2Op1 && M1Start != M2Start)
6481 return false;
6482
6483 return true;
6484}
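The mask test at the end of this function boils down to: the shuffle must read a contiguous run of lanes, and that run must begin at lane 0 (low half) or at lane NumElements/2 (high half). A simplified standalone version follows; unlike ShuffleVectorInst::isExtractSubvectorMask it does not tolerate a leading undef lane, which keeps the sketch short.

#include <cstddef>
#include <vector>

static bool extractsLowOrHighHalf(const std::vector<int> &Mask,
                                  int NumElements) {
  if (Mask.empty())
    return false;
  int Start = Mask[0];
  if (Start != 0 && Start != NumElements / 2)
    return false; // must start at the low- or high-half boundary
  for (std::size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] >= 0 && Mask[I] != Start + static_cast<int>(I))
      return false; // not a contiguous extract
  return true;
}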
6485
6486/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6487/// of the vector elements.
6488static bool areExtractExts(Value *Ext1, Value *Ext2) {
6489 auto areExtDoubled = [](Instruction *Ext) {
6490 return Ext->getType()->getScalarSizeInBits() ==
6491 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6492 };
6493
6494 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6495 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6496 !areExtDoubled(cast<Instruction>(Ext1)) ||
6497 !areExtDoubled(cast<Instruction>(Ext2)))
6498 return false;
6499
6500 return true;
6501}
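Spelled out with plain integers, the check is just "each extend doubles its source element width": for example, sext <4 x i16> to <4 x i32> qualifies, while sext <4 x i16> to <4 x i64> does not. The sketch below is illustrative only.

static bool areDoublingExts(unsigned Dst1Bits, unsigned Src1Bits,
                            unsigned Dst2Bits, unsigned Src2Bits) {
  // Both extends must exactly double the scalar bit width of their inputs.
  return Dst1Bits == 2 * Src1Bits && Dst2Bits == 2 * Src2Bits;
}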
6502
6503/// Check if Op could be used with vmull_high_p64 intrinsic.
6504 static bool isOperandOfVmullHighP64(Value *Op) {
6505 Value *VectorOperand = nullptr;
6506 ConstantInt *ElementIndex = nullptr;
6507 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6508 m_ConstantInt(ElementIndex))) &&
6509 ElementIndex->getValue() == 1 &&
6510 isa<FixedVectorType>(VectorOperand->getType()) &&
6511 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6512}
6513
6514/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6515static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6516 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6517}
6518
6519 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6520 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6521 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6522 if (!GEP || GEP->getNumOperands() != 2)
6523 return false;
6524
6525 Value *Base = GEP->getOperand(0);
6526 Value *Offsets = GEP->getOperand(1);
6527
6528 // We only care about scalar_base+vector_offsets.
6529 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6530 return false;
6531
6532 // Sink extends that would allow us to use 32-bit offset vectors.
6533 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6534 auto *OffsetsInst = cast<Instruction>(Offsets);
6535 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6536 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6537 Ops.push_back(&GEP->getOperandUse(1));
6538 }
6539
6540 // Sink the GEP.
6541 return true;
6542}
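The extend-sinking condition reduces to a width test, sketched below: sinking only pays off when the extended offsets are wider than 32 bits but the original offsets still fit in 32 bits, so instruction selection can use the 32-bit offset form of the gather/scatter addressing mode.

static bool worthSinkingOffsetExtend(unsigned ExtendedBits,
                                     unsigned OriginalBits) {
  // e.g. zext <N x i32> to <N x i64> used as offsets: 64 > 32 && 32 <= 32.
  return ExtendedBits > 32 && OriginalBits <= 32;
}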
6543
6544 /// We want to sink the following cases:
6545/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6546/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6547 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6548 if (match(Op, m_VScale()))
6549 return true;
6550 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6551 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6552 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6553 return true;
6554 }
6555 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6556 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6557 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6558 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6559 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6560 return true;
6561 }
6562 return false;
6563}
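In scalar terms, the patterns being matched compute a byte offset of the form vscale << imm (or vscale * imm) that is then added to a base pointer; a hypothetical C++ rendering is shown below. Sinking the vscale computation next to the gep lets instruction selection fold the whole expression into one scalable addressing mode.

static char *scalableAddressSketch(char *Base, unsigned long long VScale,
                                   unsigned Imm) {
  // Mirrors (gep Base, (shl vscale, Imm)); VScale stands in for the value
  // returned by llvm.vscale() at run time.
  return Base + (VScale << Imm);
}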
6564
6565/// Check if sinking \p I's operands to I's basic block is profitable, because
6566/// the operands can be folded into a target instruction, e.g.
6567/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6568 bool AArch64TTIImpl::isProfitableToSinkOperands(
6569 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6570 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6571 switch (II->getIntrinsicID()) {
6572 case Intrinsic::aarch64_neon_smull:
6573 case Intrinsic::aarch64_neon_umull:
6574 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6575 /*AllowSplat=*/true)) {
6576 Ops.push_back(&II->getOperandUse(0));
6577 Ops.push_back(&II->getOperandUse(1));
6578 return true;
6579 }
6580 [[fallthrough]];
6581
6582 case Intrinsic::fma:
6583 case Intrinsic::fmuladd:
6584 if (isa<VectorType>(I->getType()) &&
6585 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6586 !ST->hasFullFP16())
6587 return false;
6588 [[fallthrough]];
6589 case Intrinsic::aarch64_neon_sqdmull:
6590 case Intrinsic::aarch64_neon_sqdmulh:
6591 case Intrinsic::aarch64_neon_sqrdmulh:
6592 // Sink splats for index lane variants
6593 if (isSplatShuffle(II->getOperand(0)))
6594 Ops.push_back(&II->getOperandUse(0));
6595 if (isSplatShuffle(II->getOperand(1)))
6596 Ops.push_back(&II->getOperandUse(1));
6597 return !Ops.empty();
6598 case Intrinsic::aarch64_neon_fmlal:
6599 case Intrinsic::aarch64_neon_fmlal2:
6600 case Intrinsic::aarch64_neon_fmlsl:
6601 case Intrinsic::aarch64_neon_fmlsl2:
6602 // Sink splats for index lane variants
6603 if (isSplatShuffle(II->getOperand(1)))
6604 Ops.push_back(&II->getOperandUse(1));
6605 if (isSplatShuffle(II->getOperand(2)))
6606 Ops.push_back(&II->getOperandUse(2));
6607 return !Ops.empty();
6608 case Intrinsic::aarch64_sve_ptest_first:
6609 case Intrinsic::aarch64_sve_ptest_last:
6610 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6611 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6612 Ops.push_back(&II->getOperandUse(0));
6613 return !Ops.empty();
6614 case Intrinsic::aarch64_sme_write_horiz:
6615 case Intrinsic::aarch64_sme_write_vert:
6616 case Intrinsic::aarch64_sme_writeq_horiz:
6617 case Intrinsic::aarch64_sme_writeq_vert: {
6618 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6619 if (!Idx || Idx->getOpcode() != Instruction::Add)
6620 return false;
6621 Ops.push_back(&II->getOperandUse(1));
6622 return true;
6623 }
6624 case Intrinsic::aarch64_sme_read_horiz:
6625 case Intrinsic::aarch64_sme_read_vert:
6626 case Intrinsic::aarch64_sme_readq_horiz:
6627 case Intrinsic::aarch64_sme_readq_vert:
6628 case Intrinsic::aarch64_sme_ld1b_vert:
6629 case Intrinsic::aarch64_sme_ld1h_vert:
6630 case Intrinsic::aarch64_sme_ld1w_vert:
6631 case Intrinsic::aarch64_sme_ld1d_vert:
6632 case Intrinsic::aarch64_sme_ld1q_vert:
6633 case Intrinsic::aarch64_sme_st1b_vert:
6634 case Intrinsic::aarch64_sme_st1h_vert:
6635 case Intrinsic::aarch64_sme_st1w_vert:
6636 case Intrinsic::aarch64_sme_st1d_vert:
6637 case Intrinsic::aarch64_sme_st1q_vert:
6638 case Intrinsic::aarch64_sme_ld1b_horiz:
6639 case Intrinsic::aarch64_sme_ld1h_horiz:
6640 case Intrinsic::aarch64_sme_ld1w_horiz:
6641 case Intrinsic::aarch64_sme_ld1d_horiz:
6642 case Intrinsic::aarch64_sme_ld1q_horiz:
6643 case Intrinsic::aarch64_sme_st1b_horiz:
6644 case Intrinsic::aarch64_sme_st1h_horiz:
6645 case Intrinsic::aarch64_sme_st1w_horiz:
6646 case Intrinsic::aarch64_sme_st1d_horiz:
6647 case Intrinsic::aarch64_sme_st1q_horiz: {
6648 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6649 if (!Idx || Idx->getOpcode() != Instruction::Add)
6650 return false;
6651 Ops.push_back(&II->getOperandUse(3));
6652 return true;
6653 }
6654 case Intrinsic::aarch64_neon_pmull:
6655 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6656 return false;
6657 Ops.push_back(&II->getOperandUse(0));
6658 Ops.push_back(&II->getOperandUse(1));
6659 return true;
6660 case Intrinsic::aarch64_neon_pmull64:
6661 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6662 II->getArgOperand(1)))
6663 return false;
6664 Ops.push_back(&II->getArgOperandUse(0));
6665 Ops.push_back(&II->getArgOperandUse(1));
6666 return true;
6667 case Intrinsic::masked_gather:
6668 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6669 return false;
6670 Ops.push_back(&II->getArgOperandUse(0));
6671 return true;
6672 case Intrinsic::masked_scatter:
6673 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6674 return false;
6675 Ops.push_back(&II->getArgOperandUse(1));
6676 return true;
6677 default:
6678 return false;
6679 }
6680 }
6681
6682 auto ShouldSinkCondition = [](Value *Cond,
6683 SmallVectorImpl<Use *> &Ops) -> bool {
6684 if (!isa<IntrinsicInst>(Cond))
6685 return false;
6686 auto *II = cast<IntrinsicInst>(Cond);
6687 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6688 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6689 return false;
6690 if (isa<CmpInst>(II->getOperand(0)))
6691 Ops.push_back(&II->getOperandUse(0));
6692 return true;
6693 };
6694
6695 switch (I->getOpcode()) {
6696 case Instruction::GetElementPtr:
6697 case Instruction::Add:
6698 case Instruction::Sub:
6699 // Sink vscales closer to uses for better isel
6700 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6701 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6702 Ops.push_back(&I->getOperandUse(Op));
6703 return true;
6704 }
6705 }
6706 break;
6707 case Instruction::Select: {
6708 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6709 return false;
6710
6711 Ops.push_back(&I->getOperandUse(0));
6712 return true;
6713 }
6714 case Instruction::Br: {
6715 if (cast<BranchInst>(I)->isUnconditional())
6716 return false;
6717
6718 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6719 return false;
6720
6721 Ops.push_back(&I->getOperandUse(0));
6722 return true;
6723 }
6724 default:
6725 break;
6726 }
6727
6728 if (!I->getType()->isVectorTy())
6729 return false;
6730
6731 switch (I->getOpcode()) {
6732 case Instruction::Sub:
6733 case Instruction::Add: {
6734 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6735 return false;
6736
6737 // If the exts' operands extract either the lower or upper elements, we
6738 // can sink them too.
6739 auto Ext1 = cast<Instruction>(I->getOperand(0));
6740 auto Ext2 = cast<Instruction>(I->getOperand(1));
6741 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6742 Ops.push_back(&Ext1->getOperandUse(0));
6743 Ops.push_back(&Ext2->getOperandUse(0));
6744 }
6745
6746 Ops.push_back(&I->getOperandUse(0));
6747 Ops.push_back(&I->getOperandUse(1));
6748
6749 return true;
6750 }
6751 case Instruction::Or: {
6752 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6753 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6754 if (ST->hasNEON()) {
6755 Instruction *OtherAnd, *IA, *IB;
6756 Value *MaskValue;
6757 // MainAnd refers to And instruction that has 'Not' as one of its operands
6758 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6759 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6760 m_Instruction(IA)))))) {
6761 if (match(OtherAnd,
6762 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6763 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6764 ? cast<Instruction>(I->getOperand(1))
6765 : cast<Instruction>(I->getOperand(0));
6766
6767 // Both Ands should be in same basic block as Or
6768 if (I->getParent() != MainAnd->getParent() ||
6769 I->getParent() != OtherAnd->getParent())
6770 return false;
6771
6772 // Non-mask operands of both Ands should also be in same basic block
6773 if (I->getParent() != IA->getParent() ||
6774 I->getParent() != IB->getParent())
6775 return false;
6776
6777 Ops.push_back(
6778 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6779 Ops.push_back(&I->getOperandUse(0));
6780 Ops.push_back(&I->getOperandUse(1));
6781
6782 return true;
6783 }
6784 }
6785 }
6786
6787 return false;
6788 }
6789 case Instruction::Mul: {
6790 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6791 auto *Ty = cast<VectorType>(V->getType());
6792 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6793 if (Ty->isScalableTy())
6794 return false;
6795
6796 // Indexed variants of Mul exist for i16 and i32 element types only.
6797 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6798 };
6799
6800 int NumZExts = 0, NumSExts = 0;
6801 for (auto &Op : I->operands()) {
6802 // Make sure we are not already sinking this operand
6803 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6804 continue;
6805
6806 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6807 auto *Ext = cast<Instruction>(Op);
6808 auto *ExtOp = Ext->getOperand(0);
6809 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6810 Ops.push_back(&Ext->getOperandUse(0));
6811 Ops.push_back(&Op);
6812
6813 if (isa<SExtInst>(Ext)) {
6814 NumSExts++;
6815 } else {
6816 NumZExts++;
6817 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6818 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6819 I->getType()->getScalarSizeInBits())
6820 NumSExts++;
6821 }
6822
6823 continue;
6824 }
6825
6826 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6827 if (!Shuffle)
6828 continue;
6829
6830 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6831 // operand and the s/zext can help create indexed s/umull. This is
6832 // especially useful to prevent i64 mul being scalarized.
6833 if (isSplatShuffle(Shuffle) &&
6834 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6835 Ops.push_back(&Shuffle->getOperandUse(0));
6836 Ops.push_back(&Op);
6837 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6838 NumSExts++;
6839 else
6840 NumZExts++;
6841 continue;
6842 }
6843
6844 Value *ShuffleOperand = Shuffle->getOperand(0);
6845 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6846 if (!Insert)
6847 continue;
6848
6849 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6850 if (!OperandInstr)
6851 continue;
6852
6853 ConstantInt *ElementConstant =
6854 dyn_cast<ConstantInt>(Insert->getOperand(2));
6855 // Check that the insertelement is inserting into element 0
6856 if (!ElementConstant || !ElementConstant->isZero())
6857 continue;
6858
6859 unsigned Opcode = OperandInstr->getOpcode();
6860 if (Opcode == Instruction::SExt)
6861 NumSExts++;
6862 else if (Opcode == Instruction::ZExt)
6863 NumZExts++;
6864 else {
6865 // If we find that the top bits are known 0, then we can sink and allow
6866 // the backend to generate a umull.
6867 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6868 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6869 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6870 continue;
6871 NumZExts++;
6872 }
6873
6874 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6875 // the And, just to hoist it again back to the load.
6876 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6877 Ops.push_back(&Insert->getOperandUse(1));
6878 Ops.push_back(&Shuffle->getOperandUse(0));
6879 Ops.push_back(&Op);
6880 }
6881
6882 // It is profitable to sink if we found two of the same type of extends.
6883 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6884 return true;
6885
6886 // Otherwise, see if we should sink splats for indexed variants.
6887 if (!ShouldSinkSplatForIndexedVariant(I))
6888 return false;
6889
6890 Ops.clear();
6891 if (isSplatShuffle(I->getOperand(0)))
6892 Ops.push_back(&I->getOperandUse(0));
6893 if (isSplatShuffle(I->getOperand(1)))
6894 Ops.push_back(&I->getOperandUse(1));
6895
6896 return !Ops.empty();
6897 }
6898 case Instruction::FMul: {
6899 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6900 if (I->getType()->isScalableTy())
6901 return false;
6902
6903 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6904 !ST->hasFullFP16())
6905 return false;
6906
6907 // Sink splats for index lane variants
6908 if (isSplatShuffle(I->getOperand(0)))
6909 Ops.push_back(&I->getOperandUse(0));
6910 if (isSplatShuffle(I->getOperand(1)))
6911 Ops.push_back(&I->getOperandUse(1));
6912 return !Ops.empty();
6913 }
6914 default:
6915 return false;
6916 }
6917 return false;
6918}
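The MaskedValueIsZero case above relies on a simple arithmetic fact, demonstrated standalone below: when the upper half of both multiplicands is known to be zero, the full-width product equals the widening product of the low halves, which is exactly what a single umull provides.

#include <cstdint>
#include <iostream>

int main() {
  uint64_t A = 0x00000000FFFFFFFFull; // upper 32 bits known zero
  uint64_t B = 0x0000000012345678ull; // upper 32 bits known zero
  uint64_t FullProduct = A * B;
  uint64_t WideningProduct = static_cast<uint64_t>(static_cast<uint32_t>(A)) *
                             static_cast<uint32_t>(B);
  std::cout << (FullProduct == WideningProduct) << '\n'; // prints 1
}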
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
unsigned countLeadingOnes() const
Definition APInt.h:1625
void negate()
Negate this APInt in place.
Definition APInt.h:1469
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1762
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
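Usage sketch (an illustrative helper, not code from this file): the signed relational predicates listed above can be classified with a plain switch.
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Returns true for the signed integer orderings enumerated above.
static bool isSignedOrdering(CmpInst::Predicate P) {
  switch (P) {
  case CmpInst::ICMP_SLT:
  case CmpInst::ICMP_SLE:
  case CmpInst::ICMP_SGT:
  case CmpInst::ICMP_SGE:
    return true;
  default:
    return false;
  }
}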
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
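A minimal sketch (assumed standalone helper) showing how the constant factories above combine to build fixed-width vector constants.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Builds <4 x i32> constants: zeroinitializer, all-ones, and a splat of 42.
static Constant *splatOf42(LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  Constant *Zero = Constant::getNullValue(VecTy);    // <0, 0, 0, 0>
  Constant *Ones = Constant::getAllOnesValue(VecTy); // <-1, -1, -1, -1>
  (void)Zero; (void)Ones;                            // unused here, shown for the API
  return ConstantVector::getSplat(VecTy->getElementCount(),
                                  ConstantInt::get(Type::getInt32Ty(Ctx), 42));
}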
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:760
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements (a splat of V).
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
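A hedged sketch of the IRBuilder calls listed above: it emits a masked load whose inactive lanes yield zero. The helper and its parameters are illustrative and do not correspond to code in this file.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Masked load with a zero passthru for the inactive lanes.
static Value *emitMaskedLoadOrZero(IRBuilder<> &B, Type *VecTy, Value *Ptr,
                                   Value *Mask, Align Alignment) {
  Value *PassThru = Constant::getNullValue(VecTy);
  return B.CreateMaskedLoad(VecTy, Ptr, Alignment, Mask, PassThru, "masked.ld");
}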
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
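Sketch of how InstructionCost is typically combined (assumption: ordinary arithmetic on costs, with invalidity propagating through the sum).
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// A sum of costs is valid only when every component is valid.
static InstructionCost sumCosts(InstructionCost A, InstructionCost B) {
  InstructionCost Total = A + B; // becomes invalid if either input is invalid
  return Total.isValid() ? Total : InstructionCost::getInvalid();
}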
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
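Illustrative wrapper (not from this file) around the static mask classifiers above: it asks whether a constant shuffle mask interleaves Factor input vectors.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true if Mask interleaves Factor inputs of NumInputElts lanes each.
static bool looksLikeInterleave(ArrayRef<int> Mask, unsigned Factor,
                                unsigned NumInputElts) {
  SmallVector<unsigned, 4> StartIndexes;
  return ShuffleVectorInst::isInterleaveMask(Mask, Factor, NumInputElts,
                                             StartIndexes);
}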
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
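A hedged usage sketch tying the cost-kind and cast-context enums above together: querying the reciprocal-throughput cost of a zext used by a normal (unmasked) load or store. The helper is illustrative; TTI is assumed to be a TargetTransformInfo obtained from the analysis manager.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Throughput cost of a zext in a normal load/store context.
static InstructionCost zextCost(const TargetTransformInfo &TTI, Type *Dst,
                                Type *Src) {
  return TTI.getCastInstrCost(Instruction::ZExt, Dst, Src,
                              TargetTransformInfo::CastContextHint::Normal,
                              TargetTransformInfo::TCK_RecipThroughput);
}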
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
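Small sketch (assumed helpers) of the fixed-vs-scalable quantity API above: scalable quantities expose only a known-minimum value at compile time, and divideCoefficientBy scales that coefficient.
#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Halve an element count: <vscale x 8> -> <vscale x 4>, fixed 8 -> fixed 4.
static ElementCount halveEC(ElementCount EC) {
  return EC.divideCoefficientBy(2);
}

// A fixed-size type of at most 128 bits fits in one NEON register.
static bool fitsInNeonRegister(TypeSize TS) {
  return TS.isFixed() && TS.getFixedValue() <= 128;
}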
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
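Illustrative matcher built from the PatternMatch combinators above (not a helper from this file): it detects a widening multiply, i.e. mul(zext|sext(a), zext|sext(b)).
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// True if V is a multiply of two extended values.
static bool isWideningMul(Value *V) {
  Value *A, *B;
  return match(V, m_Mul(m_ZExtOrSExt(m_Value(A)), m_ZExtOrSExt(m_Value(B))));
}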
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
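A minimal sketch of the cost-table idiom: CostTableLookup is keyed by ISD opcode and MVT. The table below is hypothetical; the entries and values are made up for illustration only.
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

// Hypothetical per-type costs for ISD::MUL.
static unsigned lookupMulCost(MVT Ty) {
  static const CostTblEntry MulTbl[] = {
      {ISD::MUL, MVT::v4i32, 1},
      {ISD::MUL, MVT::v2i64, 4},
  };
  if (const auto *Entry = CostTableLookup(MulTbl, ISD::MUL, Ty))
    return Entry->Cost;
  return 10; // fallback guess when the type is not in the table
}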
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
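Illustrative use of the AArch64 shuffle-mask classifiers above, assuming the declarations from the backend-local AArch64PerfectShuffle.h are visible; the wrapper itself is not code from this file.
#include "AArch64PerfectShuffle.h" // assumed backend-local header
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

// True if the mask is a single zip1/zip2 or uzp1/uzp2 permutation.
static bool isSingleZipOrUzp(ArrayRef<int> M, unsigned NumElts) {
  unsigned WhichResult, OperandOrder;
  return isZIPMask(M, NumElts, WhichResult, OperandOrder) ||
         isUZPMask(M, NumElts, WhichResult);
}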
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1909
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2120
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
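Sketch (assumed predicate) combining the EVT queries above: a scalable vector whose known-minimum size exceeds one 128-bit SVE block.
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// True for scalable vectors wider than 128 bits per vscale unit.
static bool isWideScalableVec(EVT VT) {
  return VT.isScalableVector() &&
         VT.getSizeInBits().getKnownMinValue() > 128;
}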
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
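A hedged sketch of how a target hook might fill the unrolling knobs above; the specific values are examples only and are not the AArch64 backend's actual tuning.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Example-only preference settings.
static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // allow partial unrolling
  UP.Runtime = true;                // allow runtime unrolling
  UP.UpperBound = true;             // may use the trip-count upper bound
  UP.DefaultUnrollRuntimeCount = 4; // default count for runtime trip counts
  UP.PartialThreshold = 150;        // size budget for partial/runtime unrolling
}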