1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
80namespace {
81class TailFoldingOption {
82 // These bitfields will only ever be set to something non-zero in operator=,
83 // when setting the -sve-tail-folding option. This option should always be of
84 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
85 // InitialBits is one of (disabled|all|simple). EnableBits represents
86 // additional flags we're enabling, and DisableBits for those flags we're
87 // disabling. The default flag is tracked in the variable NeedsDefault, since
88 // at the time of setting the option we may not know what the default value
89 // for the CPU is.
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

94 // This value needs to be initialised to true in case the user does not
95 // explicitly set the -sve-tail-folding option.
96 bool NeedsDefault = true;
97
98 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
99
100 void setNeedsDefault(bool V) { NeedsDefault = V; }
101
102 void setEnableBit(TailFoldingOpts Bit) {
103 EnableBits |= Bit;
104 DisableBits &= ~Bit;
105 }
106
107 void setDisableBit(TailFoldingOpts Bit) {
108 EnableBits &= ~Bit;
109 DisableBits |= Bit;
110 }
111
112 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
113 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
114
115 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
116 "Initial bits should only include one of "
117 "(disabled|all|simple|default)");
118 Bits = NeedsDefault ? DefaultBits : InitialBits;
119 Bits |= EnableBits;
120 Bits &= ~DisableBits;
121
122 return Bits;
123 }
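  // Worked example (illustrative, not part of the upstream comments): with
  // -sve-tail-folding=default+noreverse on a CPU whose default is
  // TailFoldingOpts::All, NeedsDefault stays true, so getBits(All) starts
  // from All, adds no EnableBits, and clears Reverse via DisableBits,
  // yielding All & ~Reverse.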
124
125 void reportError(std::string Opt) {
126 errs() << "invalid argument '" << Opt
127 << "' to -sve-tail-folding=; the option should be of the form\n"
128 " (disabled|all|default|simple)[+(reductions|recurrences"
129 "|reverse|noreductions|norecurrences|noreverse)]\n";
130 report_fatal_error("Unrecognised tail-folding option");
131 }
132
133public:
134
135 void operator=(const std::string &Val) {
136 // If the user explicitly sets -sve-tail-folding= then treat as an error.
137 if (Val.empty()) {
138 reportError("");
139 return;
140 }
141
142 // Since the user is explicitly setting the option we don't automatically
143 // need the default unless they require it.
144 setNeedsDefault(false);
145
146 SmallVector<StringRef, 4> TailFoldTypes;
147 StringRef(Val).split(TailFoldTypes, '+', -1, false);
148
149 unsigned StartIdx = 1;
150 if (TailFoldTypes[0] == "disabled")
151 setInitialBits(TailFoldingOpts::Disabled);
152 else if (TailFoldTypes[0] == "all")
153 setInitialBits(TailFoldingOpts::All);
154 else if (TailFoldTypes[0] == "default")
155 setNeedsDefault(true);
156 else if (TailFoldTypes[0] == "simple")
157 setInitialBits(TailFoldingOpts::Simple);
158 else {
159 StartIdx = 0;
160 setInitialBits(TailFoldingOpts::Disabled);
161 }
162
163 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
164 if (TailFoldTypes[I] == "reductions")
165 setEnableBit(TailFoldingOpts::Reductions);
166 else if (TailFoldTypes[I] == "recurrences")
167 setEnableBit(TailFoldingOpts::Recurrences);
168 else if (TailFoldTypes[I] == "reverse")
169 setEnableBit(TailFoldingOpts::Reverse);
170 else if (TailFoldTypes[I] == "noreductions")
171 setDisableBit(TailFoldingOpts::Reductions);
172 else if (TailFoldTypes[I] == "norecurrences")
173 setDisableBit(TailFoldingOpts::Recurrences);
174 else if (TailFoldTypes[I] == "noreverse")
175 setDisableBit(TailFoldingOpts::Reverse);
176 else
177 reportError(Val);
178 }
179 }
180
181 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
182 return (getBits(DefaultBits) & Required) == Required;
183 }
184};
185} // namespace
186
187TailFoldingOption TailFoldingOptionLoc;
188
190 "sve-tail-folding",
191 cl::desc(
192 "Control the use of vectorisation using tail-folding for SVE where the"
193 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
194 "\ndisabled (Initial) No loop types will vectorize using "
195 "tail-folding"
196 "\ndefault (Initial) Uses the default tail-folding settings for "
197 "the target CPU"
198 "\nall (Initial) All legal loop types will vectorize using "
199 "tail-folding"
200 "\nsimple (Initial) Use tail-folding for simple loops (not "
201 "reductions or recurrences)"
202 "\nreductions Use tail-folding for loops containing reductions"
203 "\nnoreductions Inverse of above"
204 "\nrecurrences Use tail-folding for loops containing fixed order "
205 "recurrences"
206 "\nnorecurrences Inverse of above"
207 "\nreverse Use tail-folding for loops requiring reversed "
208 "predicates"
209 "\nnoreverse Inverse of above"),
    cl::location(TailFoldingOptionLoc));

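// Example invocations (illustrative, derived from the description above):
//   -sve-tail-folding=all              tail-fold every legal loop type
//   -sve-tail-folding=all+noreverse    as above, except loops that need
//                                      reversed predicates
//   -sve-tail-folding=default+reductions
//                                      the target CPU's default set plus
//                                      reduction loops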
212// Experimental option that will only be fully functional when the
213// code-generator is changed to use SVE instead of NEON for all fixed-width
214// operations.
216 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
217
218// Experimental option that will only be fully functional when the cost-model
219// and code-generator have been changed to avoid using scalable vector
220// instructions that are not legal in streaming SVE mode.
222 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
223
224static bool isSMEABIRoutineCall(const CallInst &CI,
225 const AArch64TargetLowering &TLI) {
226 const auto *F = CI.getCalledFunction();
227 return F &&
229}
230
231/// Returns true if the function has explicit operations that can only be
232/// lowered using incompatible instructions for the selected mode. This also
233/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F,
                                       const AArch64TargetLowering &TLI) {
236 for (const BasicBlock &BB : *F) {
237 for (const Instruction &I : BB) {
238 // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
240 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
241 // all native LLVM instructions can be lowered to compatible instructions.
242 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
        return true;
246 }
247 }
248 return false;
249}
250
uint64_t AArch64TTIImpl::getFMVPriority(const Function &F) const {
  StringRef AttributeStr =
253 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
254 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
  SmallVector<StringRef, 8> Features;
  FeatureStr.split(Features, ",");
257 return AArch64::getFMVPriority(Features);
258}
259
bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
  return F.hasFnAttribute("fmv-features");
262}
263
264const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
265 AArch64::FeatureExecuteOnly,
266};
267
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
270 SMECallAttrs CallAttrs(*Caller, *Callee);
271
272 // Never inline a function explicitly marked as being streaming,
273 // into a non-streaming function. Assume it was marked as streaming
274 // for a reason.
275 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
      CallAttrs.callee().hasStreamingInterfaceOrBody())
    return false;
278
279 // When inlining, we should consider the body of the function, not the
280 // interface.
281 if (CallAttrs.callee().hasStreamingBody()) {
282 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
283 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
284 }
285
286 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
287 return false;
288
289 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
290 CallAttrs.requiresPreservingZT0() ||
291 CallAttrs.requiresPreservingAllZAState()) {
292 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
293 return false;
294 }
295
296 const TargetMachine &TM = getTLI()->getTargetMachine();
297 const FeatureBitset &CallerBits =
298 TM.getSubtargetImpl(*Caller)->getFeatureBits();
299 const FeatureBitset &CalleeBits =
300 TM.getSubtargetImpl(*Callee)->getFeatureBits();
301 // Adjust the feature bitsets by inverting some of the bits. This is needed
302 // for target features that represent restrictions rather than capabilities,
303 // for example a "+execute-only" callee can be inlined into a caller without
304 // "+execute-only", but not vice versa.
305 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
306 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
307
308 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
309}
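// Illustrative example of the inversion above (not from the original source):
// a callee built with +execute-only can be inlined into a caller without
// that feature, because flipping the FeatureExecuteOnly bit makes the
// callee's effective feature set a subset of the caller's; in the reverse
// direction the subset check fails and inlining is refused.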
310
bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
313 const ArrayRef<Type *> &Types) const {
314 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
315 return false;
316
317 // We need to ensure that argument promotion does not attempt to promote
318 // pointers to fixed-length vector types larger than 128 bits like
319 // <8 x float> (and pointers to aggregate types which have such fixed-length
320 // vector type members) into the values of the pointees. Such vector types
321 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
322 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
323 // types can be safely treated as 128-bit NEON types and they cannot be
324 // distinguished in IR.
325 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
326 auto FVTy = dyn_cast<FixedVectorType>(Ty);
327 return FVTy &&
328 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
329 }))
330 return false;
331
332 return true;
333}
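// For example (illustrative): promoting a `<8 x float>*` argument would
// create a 256-bit fixed-length vector value, which only has an SVE VLS
// representation and no stable ABI, so the check above rejects it; a
// `<4 x float>*` argument is fine because the 128-bit value is a plain
// NEON type.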
334
335unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
338 // This function calculates a penalty for executing Call in F.
339 //
340 // There are two ways this function can be called:
341 // (1) F:
342 // call from F -> G (the call here is Call)
343 //
344 // For (1), Call.getCaller() == F, so it will always return a high cost if
345 // a streaming-mode change is required (thus promoting the need to inline the
346 // function)
347 //
348 // (2) F:
349 // call from F -> G (the call here is not Call)
350 // G:
351 // call from G -> H (the call here is Call)
352 //
353 // For (2), if after inlining the body of G into F the call to H requires a
354 // streaming-mode change, and the call to G from F would also require a
355 // streaming-mode change, then there is benefit to do the streaming-mode
356 // change only once and avoid inlining of G into F.
357
358 SMEAttrs FAttrs(*F);
359 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
360
361 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
362 if (F == Call.getCaller()) // (1)
363 return CallPenaltyChangeSM * DefaultCallPenalty;
364 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
365 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
366 }
367
368 return DefaultCallPenalty;
369}
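// Worked example (illustrative): with the default option values above, a call
// needing a PSTATE.SM change in case (1) is costed at CallPenaltyChangeSM (5)
// * DefaultCallPenalty, and in case (2) at InlineCallPenaltyChangeSM (10) *
// DefaultCallPenalty, strongly discouraging the inlining of G into F.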
370
377
378/// Calculate the cost of materializing a 64-bit value. This helper
379/// method might only calculate a fraction of a larger immediate. Therefore it
380/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
383 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
384 return 0;
385
386 if (Val < 0)
387 Val = ~Val;
388
389 // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
392 return Insn.size();
393}
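// Example (illustrative): a value with two non-zero 16-bit chunks, such as
// 0x0000123400005678, expands to a MOVZ plus one MOVK, so the returned cost
// is 2, whereas 0 or any logical immediate costs 0.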
394
395/// Calculate the cost of materializing the given constant.
InstructionCost
AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                              TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());
400
401 unsigned BitSize = Ty->getPrimitiveSizeInBits();
402 if (BitSize == 0)
403 return ~0U;
404
405 // Sign-extend all constants to a multiple of 64-bit.
406 APInt ImmVal = Imm;
407 if (BitSize & 0x3f)
408 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
409
410 // Split the constant into 64-bit chunks and calculate the cost for each
411 // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
414 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
415 int64_t Val = Tmp.getSExtValue();
416 Cost += getIntImmCost(Val);
417 }
  // We need at least one instruction to materialize the constant.
419 return std::max<InstructionCost>(1, Cost);
420}
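// Example (illustrative): a 128-bit constant is sign-extended, split into two
// 64-bit chunks, and each chunk is costed independently; an all-zero chunk
// contributes 0, which is why the final result is clamped to at least 1.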
421
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) const {
426 assert(Ty->isIntegerTy());
427
428 unsigned BitSize = Ty->getPrimitiveSizeInBits();
429 // There is no cost model for constants with a bit size of 0. Return TCC_Free
430 // here, so that constant hoisting will ignore this constant.
431 if (BitSize == 0)
432 return TTI::TCC_Free;
433
434 unsigned ImmIdx = ~0U;
435 switch (Opcode) {
436 default:
437 return TTI::TCC_Free;
438 case Instruction::GetElementPtr:
439 // Always hoist the base address of a GetElementPtr.
440 if (Idx == 0)
441 return 2 * TTI::TCC_Basic;
442 return TTI::TCC_Free;
443 case Instruction::Store:
444 ImmIdx = 0;
445 break;
446 case Instruction::Add:
447 case Instruction::Sub:
448 case Instruction::Mul:
449 case Instruction::UDiv:
450 case Instruction::SDiv:
451 case Instruction::URem:
452 case Instruction::SRem:
453 case Instruction::And:
454 case Instruction::Or:
455 case Instruction::Xor:
456 case Instruction::ICmp:
457 ImmIdx = 1;
458 break;
459 // Always return TCC_Free for the shift value of a shift instruction.
460 case Instruction::Shl:
461 case Instruction::LShr:
462 case Instruction::AShr:
463 if (Idx == 1)
464 return TTI::TCC_Free;
465 break;
466 case Instruction::Trunc:
467 case Instruction::ZExt:
468 case Instruction::SExt:
469 case Instruction::IntToPtr:
470 case Instruction::PtrToInt:
471 case Instruction::BitCast:
472 case Instruction::PHI:
473 case Instruction::Call:
474 case Instruction::Select:
475 case Instruction::Ret:
476 case Instruction::Load:
477 break;
478 }
479
480 if (Idx == ImmIdx) {
481 int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
484 ? static_cast<int>(TTI::TCC_Free)
485 : Cost;
486 }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
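// For example (illustrative): for an `add i64` whose constant operand needs
// several MOVZ/MOVK instructions to materialize, Idx == ImmIdx == 1 and the
// materialization cost exceeds NumConstants * TTI::TCC_Basic, so the constant
// is reported at its real cost and becomes a hoisting candidate; small or
// logical immediates are reported as TTI::TCC_Free and stay in place.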
489
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());
495
496 unsigned BitSize = Ty->getPrimitiveSizeInBits();
497 // There is no cost model for constants with a bit size of 0. Return TCC_Free
498 // here, so that constant hoisting will ignore this constant.
499 if (BitSize == 0)
500 return TTI::TCC_Free;
501
502 // Most (all?) AArch64 intrinsics do not support folding immediates into the
503 // selected instruction, so we compute the materialization cost for the
504 // immediate directly.
505 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

508 switch (IID) {
509 default:
510 return TTI::TCC_Free;
511 case Intrinsic::sadd_with_overflow:
512 case Intrinsic::uadd_with_overflow:
513 case Intrinsic::ssub_with_overflow:
514 case Intrinsic::usub_with_overflow:
515 case Intrinsic::smul_with_overflow:
516 case Intrinsic::umul_with_overflow:
517 if (Idx == 1) {
518 int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
521 ? static_cast<int>(TTI::TCC_Free)
522 : Cost;
523 }
524 break;
525 case Intrinsic::experimental_stackmap:
526 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
527 return TTI::TCC_Free;
528 break;
529 case Intrinsic::experimental_patchpoint_void:
530 case Intrinsic::experimental_patchpoint:
531 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
532 return TTI::TCC_Free;
533 break;
534 case Intrinsic::experimental_gc_statepoint:
535 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
541
TTI::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
544 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
547 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
548 return TTI::PSK_Software;
549}
550
551static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
555
static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
                                        const IntrinsicCostAttributes &ICA) {
558 // We need to know at least the number of elements in the vector of buckets
559 // and the size of each element to update.
560 if (ICA.getArgTypes().size() < 2)
    return InstructionCost::getInvalid();

563 // Only interested in costing for the hardware instruction from SVE2.
564 if (!ST->hasSVE2())
    return InstructionCost::getInvalid();

567 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
568 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
569 unsigned TotalHistCnts = 1;
570
571 unsigned EltSize = EltTy->getScalarSizeInBits();
572 // Only allow (up to 64b) integers or pointers
573 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

576 // FIXME: We should be able to generate histcnt for fixed-length vectors
577 // using ptrue with a specific VL.
578 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
579 unsigned EC = VTy->getElementCount().getKnownMinValue();
580 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

583 // HistCnt only supports 32b and 64b element types
584 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
585
586 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

589 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
590 TotalHistCnts = EC / NaturalVectorWidth;
591
592 return InstructionCost(BaseHistCntCost * TotalHistCnts);
593 }
594
  return InstructionCost::getInvalid();
}
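// Worked example (illustrative): for a histogram over <vscale x 8 x ptr>
// buckets with i64 elements, LegalEltSize is 64, NaturalVectorWidth is
// 128 / 64 = 2, TotalHistCnts is 8 / 2 = 4, and the returned cost is
// 4 * BaseHistCntCost (32 with the default option value).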
597
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  // The code-generator is currently not able to handle scalable vectors
602 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
603 // it. This change will be removed when code-generation for these types is
604 // sufficiently reliable.
605 auto *RetTy = ICA.getReturnType();
606 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
607 if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

610 switch (ICA.getID()) {
611 case Intrinsic::experimental_vector_histogram_add: {
612 InstructionCost HistCost = getHistogramCost(ST, ICA);
613 // If the cost isn't valid, we may still be able to scalarize
614 if (HistCost.isValid())
615 return HistCost;
616 break;
617 }
618 case Intrinsic::umin:
619 case Intrinsic::umax:
620 case Intrinsic::smin:
621 case Intrinsic::smax: {
622 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
623 MVT::v8i16, MVT::v2i32, MVT::v4i32,
624 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
625 MVT::nxv2i64};
626 auto LT = getTypeLegalizationCost(RetTy);
627 // v2i64 types get converted to cmp+bif hence the cost of 2
628 if (LT.second == MVT::v2i64)
629 return LT.first * 2;
630 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
631 return LT.first;
632 break;
633 }
634 case Intrinsic::sadd_sat:
635 case Intrinsic::ssub_sat:
636 case Intrinsic::uadd_sat:
637 case Intrinsic::usub_sat: {
638 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
639 MVT::v8i16, MVT::v2i32, MVT::v4i32,
640 MVT::v2i64};
641 auto LT = getTypeLegalizationCost(RetTy);
642 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
643 // need to extend the type, as it uses shr(qadd(shl, shl)).
644 unsigned Instrs =
645 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
646 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
647 return LT.first * Instrs;
648
650 uint64_t VectorSize = TS.getKnownMinValue();
651
652 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
653 return LT.first * Instrs;
654
655 break;
656 }
657 case Intrinsic::abs: {
658 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
659 MVT::v8i16, MVT::v2i32, MVT::v4i32,
660 MVT::v2i64};
661 auto LT = getTypeLegalizationCost(RetTy);
662 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
663 return LT.first;
664 break;
665 }
666 case Intrinsic::bswap: {
667 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
668 MVT::v4i32, MVT::v2i64};
669 auto LT = getTypeLegalizationCost(RetTy);
670 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
671 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
672 return LT.first;
673 break;
674 }
675 case Intrinsic::fma:
676 case Intrinsic::fmuladd: {
    // Given an fma or fmuladd, cost it the same as an fmul instruction, which
    // usually has the same cost. TODO: Add fp16 and bf16 expansion costs.
679 Type *EltTy = RetTy->getScalarType();
680 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
681 (EltTy->isHalfTy() && ST->hasFullFP16()))
682 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
683 break;
684 }
685 case Intrinsic::stepvector: {
686 InstructionCost Cost = 1; // Cost of the `index' instruction
687 auto LT = getTypeLegalizationCost(RetTy);
688 // Legalisation of illegal vectors involves an `index' instruction plus
689 // (LT.first - 1) vector adds.
690 if (LT.first > 1) {
691 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
692 InstructionCost AddCost =
693 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
694 Cost += AddCost * (LT.first - 1);
695 }
696 return Cost;
697 }
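  // Worked example (illustrative): an illegal <vscale x 8 x i64> stepvector
  // legalizes into LT.first == 4 parts, so the cost is one `index`
  // instruction plus three vector adds to offset the remaining parts.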
698 case Intrinsic::vector_extract:
699 case Intrinsic::vector_insert: {
700 // If both the vector and subvector types are legal types and the index
701 // is 0, then this should be a no-op or simple operation; return a
702 // relatively low cost.
703
704 // If arguments aren't actually supplied, then we cannot determine the
705 // value of the index. We also want to skip predicate types.
706 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
708 break;
709
710 LLVMContext &C = RetTy->getContext();
711 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
712 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
713 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
714 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
715 // Skip this if either the vector or subvector types are unpacked
716 // SVE types; they may get lowered to stack stores and loads.
717 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
718 break;
719
    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
724 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
725 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
726 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
727 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
728 return TTI::TCC_Free;
729 break;
730 }
731 case Intrinsic::bitreverse: {
732 static const CostTblEntry BitreverseTbl[] = {
733 {Intrinsic::bitreverse, MVT::i32, 1},
734 {Intrinsic::bitreverse, MVT::i64, 1},
735 {Intrinsic::bitreverse, MVT::v8i8, 1},
736 {Intrinsic::bitreverse, MVT::v16i8, 1},
737 {Intrinsic::bitreverse, MVT::v4i16, 2},
738 {Intrinsic::bitreverse, MVT::v8i16, 2},
739 {Intrinsic::bitreverse, MVT::v2i32, 2},
740 {Intrinsic::bitreverse, MVT::v4i32, 2},
741 {Intrinsic::bitreverse, MVT::v1i64, 2},
742 {Intrinsic::bitreverse, MVT::v2i64, 2},
743 };
744 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
745 const auto *Entry =
746 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
747 if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
750 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
751 TLI->getValueType(DL, RetTy, true) == MVT::i16)
752 return LegalisationCost.first * Entry->Cost + 1;
753
754 return LegalisationCost.first * Entry->Cost;
755 }
756 break;
757 }
758 case Intrinsic::ctpop: {
759 if (!ST->hasNEON()) {
760 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
761 return getTypeLegalizationCost(RetTy).first * 12;
762 }
763 static const CostTblEntry CtpopCostTbl[] = {
764 {ISD::CTPOP, MVT::v2i64, 4},
765 {ISD::CTPOP, MVT::v4i32, 3},
766 {ISD::CTPOP, MVT::v8i16, 2},
767 {ISD::CTPOP, MVT::v16i8, 1},
768 {ISD::CTPOP, MVT::i64, 4},
769 {ISD::CTPOP, MVT::v2i32, 3},
770 {ISD::CTPOP, MVT::v4i16, 2},
771 {ISD::CTPOP, MVT::v8i8, 1},
772 {ISD::CTPOP, MVT::i32, 5},
773 };
774 auto LT = getTypeLegalizationCost(RetTy);
775 MVT MTy = LT.second;
776 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
777 // Extra cost of +1 when illegal vector types are legalized by promoting
778 // the integer type.
779 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
780 RetTy->getScalarSizeInBits()
781 ? 1
782 : 0;
783 return LT.first * Entry->Cost + ExtraCost;
784 }
785 break;
786 }
787 case Intrinsic::sadd_with_overflow:
788 case Intrinsic::uadd_with_overflow:
789 case Intrinsic::ssub_with_overflow:
790 case Intrinsic::usub_with_overflow:
791 case Intrinsic::smul_with_overflow:
792 case Intrinsic::umul_with_overflow: {
793 static const CostTblEntry WithOverflowCostTbl[] = {
794 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
795 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
796 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
797 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
798 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
799 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
800 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
801 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
802 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
803 {Intrinsic::usub_with_overflow, MVT::i8, 3},
804 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
805 {Intrinsic::usub_with_overflow, MVT::i16, 3},
806 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
807 {Intrinsic::usub_with_overflow, MVT::i32, 1},
808 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
809 {Intrinsic::usub_with_overflow, MVT::i64, 1},
810 {Intrinsic::smul_with_overflow, MVT::i8, 5},
811 {Intrinsic::umul_with_overflow, MVT::i8, 4},
812 {Intrinsic::smul_with_overflow, MVT::i16, 5},
813 {Intrinsic::umul_with_overflow, MVT::i16, 4},
814 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
815 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
816 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
817 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
818 };
819 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
820 if (MTy.isSimple())
821 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
822 MTy.getSimpleVT()))
823 return Entry->Cost;
824 break;
825 }
826 case Intrinsic::fptosi_sat:
827 case Intrinsic::fptoui_sat: {
828 if (ICA.getArgTypes().empty())
829 break;
830 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
831 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
832 EVT MTy = TLI->getValueType(DL, RetTy);
833 // Check for the legal types, which are where the size of the input and the
834 // output are the same, or we are using cvt f64->i32 or f32->i64.
835 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
836 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
837 LT.second == MVT::v2f64)) {
838 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
839 (LT.second == MVT::f64 && MTy == MVT::i32) ||
840 (LT.second == MVT::f32 && MTy == MVT::i64)))
841 return LT.first;
842 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
843 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
844 MTy.getScalarSizeInBits() == 64)
845 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
846 }
847 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
848 // f32.
849 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
850 return LT.first + getIntrinsicInstrCost(
851 {ICA.getID(),
852 RetTy,
853 {ICA.getArgTypes()[0]->getWithNewType(
854 Type::getFloatTy(RetTy->getContext()))}},
855 CostKind);
856 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
857 (LT.second == MVT::f16 && MTy == MVT::i64) ||
858 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
859 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
860 return LT.first;
861 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
862 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
863 MTy.getScalarSizeInBits() == 32)
864 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
    // Extending vector types v8f16->v8i64. These currently scalarize but the
    // codegen could be better.
867 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
868 MTy.getScalarSizeInBits() == 64)
869 return MTy.getVectorNumElements() * 3;
870
    // If we can, use a legal convert followed by a min+max.
872 if ((LT.second.getScalarType() == MVT::f32 ||
873 LT.second.getScalarType() == MVT::f64 ||
874 LT.second.getScalarType() == MVT::f16) &&
875 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
876 Type *LegalTy =
877 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
878 if (LT.second.isVector())
879 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost +
888 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
889 : 1);
890 }
891 // Otherwise we need to follow the default expansion that clamps the value
892 // using a float min/max with a fcmp+sel for nan handling when signed.
893 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
894 RetTy = RetTy->getScalarType();
895 if (LT.second.isVector()) {
896 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
897 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
898 }
    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
    if (IsSigned) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
912 }
913 return LT.first * Cost;
914 }
915 case Intrinsic::fshl:
916 case Intrinsic::fshr: {
917 if (ICA.getArgs().empty())
918 break;
919
920 // TODO: Add handling for fshl where third argument is not a constant.
921 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
922 if (!OpInfoZ.isConstant())
923 break;
924
925 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
926 if (OpInfoZ.isUniform()) {
927 static const CostTblEntry FshlTbl[] = {
928 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
929 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
930 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
931 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
932 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
933 // to avoid having to duplicate the costs.
934 const auto *Entry =
935 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
936 if (Entry)
937 return LegalisationCost.first * Entry->Cost;
938 }
939
940 auto TyL = getTypeLegalizationCost(RetTy);
941 if (!RetTy->isIntegerTy())
942 break;
943
944 // Estimate cost manually, as types like i8 and i16 will get promoted to
945 // i32 and CostTableLookup will ignore the extra conversion cost.
946 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
947 RetTy->getScalarSizeInBits() < 64) ||
948 (RetTy->getScalarSizeInBits() % 64 != 0);
949 unsigned ExtraCost = HigherCost ? 1 : 0;
950 if (RetTy->getScalarSizeInBits() == 32 ||
951 RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
954 else if (HigherCost)
955 ExtraCost = 1;
956 else
957 break;
958 return TyL.first + ExtraCost;
959 }
960 case Intrinsic::get_active_lane_mask: {
961 auto RetTy = cast<VectorType>(ICA.getReturnType());
962 EVT RetVT = getTLI()->getValueType(DL, RetTy);
963 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
964 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
965 break;
966
967 if (RetTy->isScalableTy()) {
968 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
970 break;
971
972 auto LT = getTypeLegalizationCost(RetTy);
973 InstructionCost Cost = LT.first;
974 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
975 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
976 // nxv32i1 = get_active_lane_mask(base, idx) ->
977 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
978 if (ST->hasSVE2p1() || ST->hasSME2()) {
979 Cost /= 2;
980 if (Cost == 1)
981 return Cost;
982 }
983
984 // If more than one whilelo intrinsic is required, include the extra cost
985 // required by the saturating add & select required to increment the
986 // start value after the first intrinsic call.
987 Type *OpTy = ICA.getArgTypes()[0];
988 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
989 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
990 Type *CondTy = OpTy->getWithNewBitWidth(1);
991 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost + (SplitCost * (Cost - 1));
994 } else if (!getTLI()->isTypeLegal(RetVT)) {
995 // We don't have enough context at this point to determine if the mask
996 // is going to be kept live after the block, which will force the vXi1
997 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
998 // For now, we just assume the vectorizer created this intrinsic and
999 // the result will be the input for a PHI. In this case the cost will
1000 // be extremely high for fixed-width vectors.
1001 // NOTE: getScalarizationOverhead returns a cost that's far too
1002 // pessimistic for the actual generated codegen. In reality there are
1003 // two instructions generated per lane.
1004 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1005 }
1006 break;
1007 }
1008 case Intrinsic::experimental_vector_match: {
1009 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1010 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1011 unsigned SearchSize = NeedleTy->getNumElements();
1012 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1013 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1014 // Neoverse V3, these are cheap operations with the same latency as a
1015 // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
      // instructions besides the MATCH.
1019 if (isa<FixedVectorType>(RetTy))
1020 Cost += 10;
1021 return Cost;
1022 }
1023 break;
1024 }
1025 case Intrinsic::experimental_cttz_elts: {
1026 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1027 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1028 // This will consist of a SVE brkb and a cntp instruction. These
1029 // typically have the same latency and half the throughput as a vector
1030 // add instruction.
1031 return 4;
1032 }
1033 break;
1034 }
1035 default:
1036 break;
1037 }
1039}
1040
/// Remove redundant reinterpret casts of predicates in the presence of
/// control flow.
1043static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1044 IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();
1047
1048 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1049 assert(PN && "Expected Phi Node!");
1050
1051 // Don't create a new Phi unless we can remove the old one.
1052 if (!PN->hasOneUse())
1053 return std::nullopt;
1054
1055 for (Value *IncValPhi : PN->incoming_values()) {
1056 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1057 if (!Reinterpret ||
1058 Reinterpret->getIntrinsicID() !=
1059 Intrinsic::aarch64_sve_convert_to_svbool ||
1060 RequiredType != Reinterpret->getArgOperand(0)->getType())
1061 return std::nullopt;
1062 }
1063
1064 // Create the new Phi
1065 IC.Builder.SetInsertPoint(PN);
1066 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1067 Worklist.push_back(PN);
1068
1069 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1070 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1071 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1072 Worklist.push_back(Reinterpret);
1073 }
1074
1075 // Cleanup Phi Node and reinterprets
1076 return IC.replaceInstUsesWith(II, NPN);
1077}
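// Illustrative IR for the combine above (not from the original source): given
//   %phi = phi <vscale x 16 x i1> [ %c1, %bb1 ], [ %c2, %bb2 ]
// where each %cN = @llvm.aarch64.sve.convert.to.svbool(%pN) and II converts
// %phi back to the type of %pN, the phi is rebuilt directly over %p1/%p2 and
// the to/from svbool round trip disappears.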
1078
1079// A collection of properties common to SVE intrinsics that allow for combines
// to be written without needing to know the specific intrinsic.
struct SVEIntrinsicInfo {
  //
1083 // Helper routines for common intrinsic definitions.
1084 //
1085
1086 // e.g. llvm.aarch64.sve.add pg, op1, op2
1087 // with IID ==> llvm.aarch64.sve.add_u
1088 static SVEIntrinsicInfo
1095
1096 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1103
1104 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1110
1111 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1117
1118 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1119 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1120 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1121 return SVEIntrinsicInfo()
1124 }
1125
1126 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1127 // llvm.aarch64.sve.ld1 pg, ptr
1134
1135 // All properties relate to predication and thus having a general predicate
1136 // is the minimum requirement to say there is intrinsic info to act on.
1137 explicit operator bool() const { return hasGoverningPredicate(); }
1138
1139 //
1140 // Properties relating to the governing predicate.
1141 //
1142
1144 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1145 }
1146
    assert(hasGoverningPredicate() && "Property not set!");
1149 return GoverningPredicateIdx;
1150 }
1151
1153 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1154 GoverningPredicateIdx = Index;
1155 return *this;
1156 }
1157
1158 //
1159 // Properties relating to operations the intrinsic could be transformed into.
1160 // NOTE: This does not mean such a transformation is always possible, but the
1161 // knowledge makes it possible to reuse existing optimisations without needing
1162 // to embed specific handling for each intrinsic. For example, instruction
1163 // simplification can be used to optimise an intrinsic's active lanes.
1164 //
1165
1167 return UndefIntrinsic != Intrinsic::not_intrinsic;
1168 }
1169
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
1172 return UndefIntrinsic;
1173 }
1174
1176 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1177 UndefIntrinsic = IID;
1178 return *this;
1179 }
1180
1181 bool hasMatchingIROpode() const { return IROpcode != 0; }
1182
1183 unsigned getMatchingIROpode() const {
    assert(hasMatchingIROpode() && "Property not set!");
1185 return IROpcode;
1186 }
1187
1189 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1190 IROpcode = Opcode;
1191 return *this;
1192 }
1193
1194 //
1195 // Properties relating to the result of inactive lanes.
1196 //
1197
1199 return ResultLanes == InactiveLanesTakenFromOperand;
1200 }
1201
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
1204 return OperandIdxForInactiveLanes;
1205 }
1206
1208 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1209 ResultLanes = InactiveLanesTakenFromOperand;
1210 OperandIdxForInactiveLanes = Index;
1211 return *this;
1212 }
1213
1215 return ResultLanes == InactiveLanesAreNotDefined;
1216 }
1217
1219 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1220 ResultLanes = InactiveLanesAreNotDefined;
1221 return *this;
1222 }
1223
1225 return ResultLanes == InactiveLanesAreUnused;
1226 }
1227
1229 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1230 ResultLanes = InactiveLanesAreUnused;
1231 return *this;
1232 }
1233
1234 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1235 // inactiveLanesAreZeroed =
1236 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1237 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1238
1240 ResultIsZeroInitialized = true;
1241 return *this;
1242 }
1243
1244 //
1245 // The first operand of unary merging operations is typically only used to
1246 // set the result for inactive lanes. Knowing this allows us to deadcode the
1247 // operand when we can prove there are no inactive lanes.
1248 //
1249
1251 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1252 }
1253
    assert(hasOperandWithNoActiveLanes() && "Property not set!");
1256 return OperandIdxWithNoActiveLanes;
1257 }
1258
1260 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1261 OperandIdxWithNoActiveLanes = Index;
1262 return *this;
1263 }
1264
1265private:
1266 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1267
1268 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1269 unsigned IROpcode = 0;
1270
1271 enum PredicationStyle {
1273 InactiveLanesTakenFromOperand,
1274 InactiveLanesAreNotDefined,
1275 InactiveLanesAreUnused
1276 } ResultLanes = Uninitialized;
1277
1278 bool ResultIsZeroInitialized = false;
1279 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1280 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1281};
1282
1284 // Some SVE intrinsics do not use scalable vector types, but since they are
1285 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1286 if (!isa<ScalableVectorType>(II.getType()) &&
1287 all_of(II.args(), [&](const Value *V) {
1288 return !isa<ScalableVectorType>(V->getType());
1289 }))
1290 return SVEIntrinsicInfo();
1291
1292 Intrinsic::ID IID = II.getIntrinsicID();
1293 switch (IID) {
1294 default:
1295 break;
1296 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1297 case Intrinsic::aarch64_sve_fcvt_f16f32:
1298 case Intrinsic::aarch64_sve_fcvt_f16f64:
1299 case Intrinsic::aarch64_sve_fcvt_f32f16:
1300 case Intrinsic::aarch64_sve_fcvt_f32f64:
1301 case Intrinsic::aarch64_sve_fcvt_f64f16:
1302 case Intrinsic::aarch64_sve_fcvt_f64f32:
1303 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1304 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1305 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1306 case Intrinsic::aarch64_sve_fcvtzs:
1307 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1308 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1309 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1310 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1311 case Intrinsic::aarch64_sve_fcvtzu:
1312 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1313 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1314 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1315 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1316 case Intrinsic::aarch64_sve_scvtf:
1317 case Intrinsic::aarch64_sve_scvtf_f16i32:
1318 case Intrinsic::aarch64_sve_scvtf_f16i64:
1319 case Intrinsic::aarch64_sve_scvtf_f32i64:
1320 case Intrinsic::aarch64_sve_scvtf_f64i32:
1321 case Intrinsic::aarch64_sve_ucvtf:
1322 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1323 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1324 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1325 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1327
1328 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1329 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1330 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1331 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1333
1334 case Intrinsic::aarch64_sve_fabd:
1335 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1336 case Intrinsic::aarch64_sve_fadd:
1337 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1338 .setMatchingIROpcode(Instruction::FAdd);
1339 case Intrinsic::aarch64_sve_fdiv:
1340 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1341 .setMatchingIROpcode(Instruction::FDiv);
1342 case Intrinsic::aarch64_sve_fmax:
1343 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1344 case Intrinsic::aarch64_sve_fmaxnm:
1345 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1346 case Intrinsic::aarch64_sve_fmin:
1347 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1348 case Intrinsic::aarch64_sve_fminnm:
1349 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1350 case Intrinsic::aarch64_sve_fmla:
1351 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1352 case Intrinsic::aarch64_sve_fmls:
1353 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1354 case Intrinsic::aarch64_sve_fmul:
1355 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1356 .setMatchingIROpcode(Instruction::FMul);
1357 case Intrinsic::aarch64_sve_fmulx:
1358 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1359 case Intrinsic::aarch64_sve_fnmla:
1360 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1361 case Intrinsic::aarch64_sve_fnmls:
1362 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1363 case Intrinsic::aarch64_sve_fsub:
1364 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1365 .setMatchingIROpcode(Instruction::FSub);
1366 case Intrinsic::aarch64_sve_add:
1367 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1368 .setMatchingIROpcode(Instruction::Add);
1369 case Intrinsic::aarch64_sve_mla:
1370 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1371 case Intrinsic::aarch64_sve_mls:
1372 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1373 case Intrinsic::aarch64_sve_mul:
1374 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1375 .setMatchingIROpcode(Instruction::Mul);
1376 case Intrinsic::aarch64_sve_sabd:
1377 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1378 case Intrinsic::aarch64_sve_sdiv:
1379 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1380 .setMatchingIROpcode(Instruction::SDiv);
1381 case Intrinsic::aarch64_sve_smax:
1382 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1383 case Intrinsic::aarch64_sve_smin:
1384 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1385 case Intrinsic::aarch64_sve_smulh:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1387 case Intrinsic::aarch64_sve_sub:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1389 .setMatchingIROpcode(Instruction::Sub);
1390 case Intrinsic::aarch64_sve_uabd:
1391 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1392 case Intrinsic::aarch64_sve_udiv:
1393 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1394 .setMatchingIROpcode(Instruction::UDiv);
1395 case Intrinsic::aarch64_sve_umax:
1396 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1397 case Intrinsic::aarch64_sve_umin:
1398 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1399 case Intrinsic::aarch64_sve_umulh:
1400 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1401 case Intrinsic::aarch64_sve_asr:
1402 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1403 .setMatchingIROpcode(Instruction::AShr);
1404 case Intrinsic::aarch64_sve_lsl:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1406 .setMatchingIROpcode(Instruction::Shl);
1407 case Intrinsic::aarch64_sve_lsr:
1408 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1409 .setMatchingIROpcode(Instruction::LShr);
1410 case Intrinsic::aarch64_sve_and:
1411 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1412 .setMatchingIROpcode(Instruction::And);
1413 case Intrinsic::aarch64_sve_bic:
1414 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1415 case Intrinsic::aarch64_sve_eor:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1417 .setMatchingIROpcode(Instruction::Xor);
1418 case Intrinsic::aarch64_sve_orr:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1420 .setMatchingIROpcode(Instruction::Or);
1421 case Intrinsic::aarch64_sve_sqsub:
1422 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1423 case Intrinsic::aarch64_sve_uqsub:
1424 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1425
1426 case Intrinsic::aarch64_sve_add_u:
1428 Instruction::Add);
1429 case Intrinsic::aarch64_sve_and_u:
1431 Instruction::And);
1432 case Intrinsic::aarch64_sve_asr_u:
1434 Instruction::AShr);
1435 case Intrinsic::aarch64_sve_eor_u:
1437 Instruction::Xor);
1438 case Intrinsic::aarch64_sve_fadd_u:
1440 Instruction::FAdd);
1441 case Intrinsic::aarch64_sve_fdiv_u:
1443 Instruction::FDiv);
1444 case Intrinsic::aarch64_sve_fmul_u:
1446 Instruction::FMul);
1447 case Intrinsic::aarch64_sve_fsub_u:
1449 Instruction::FSub);
1450 case Intrinsic::aarch64_sve_lsl_u:
1452 Instruction::Shl);
1453 case Intrinsic::aarch64_sve_lsr_u:
1455 Instruction::LShr);
1456 case Intrinsic::aarch64_sve_mul_u:
1458 Instruction::Mul);
1459 case Intrinsic::aarch64_sve_orr_u:
1461 Instruction::Or);
1462 case Intrinsic::aarch64_sve_sdiv_u:
1464 Instruction::SDiv);
1465 case Intrinsic::aarch64_sve_sub_u:
1467 Instruction::Sub);
1468 case Intrinsic::aarch64_sve_udiv_u:
1470 Instruction::UDiv);
1471
1472 case Intrinsic::aarch64_sve_addqv:
1473 case Intrinsic::aarch64_sve_and_z:
1474 case Intrinsic::aarch64_sve_bic_z:
1475 case Intrinsic::aarch64_sve_brka_z:
1476 case Intrinsic::aarch64_sve_brkb_z:
1477 case Intrinsic::aarch64_sve_brkn_z:
1478 case Intrinsic::aarch64_sve_brkpa_z:
1479 case Intrinsic::aarch64_sve_brkpb_z:
1480 case Intrinsic::aarch64_sve_cntp:
1481 case Intrinsic::aarch64_sve_compact:
1482 case Intrinsic::aarch64_sve_eor_z:
1483 case Intrinsic::aarch64_sve_eorv:
1484 case Intrinsic::aarch64_sve_eorqv:
1485 case Intrinsic::aarch64_sve_nand_z:
1486 case Intrinsic::aarch64_sve_nor_z:
1487 case Intrinsic::aarch64_sve_orn_z:
1488 case Intrinsic::aarch64_sve_orr_z:
1489 case Intrinsic::aarch64_sve_orv:
1490 case Intrinsic::aarch64_sve_orqv:
1491 case Intrinsic::aarch64_sve_pnext:
1492 case Intrinsic::aarch64_sve_rdffr_z:
1493 case Intrinsic::aarch64_sve_saddv:
1494 case Intrinsic::aarch64_sve_uaddv:
1495 case Intrinsic::aarch64_sve_umaxv:
1496 case Intrinsic::aarch64_sve_umaxqv:
1497 case Intrinsic::aarch64_sve_cmpeq:
1498 case Intrinsic::aarch64_sve_cmpeq_wide:
1499 case Intrinsic::aarch64_sve_cmpge:
1500 case Intrinsic::aarch64_sve_cmpge_wide:
1501 case Intrinsic::aarch64_sve_cmpgt:
1502 case Intrinsic::aarch64_sve_cmpgt_wide:
1503 case Intrinsic::aarch64_sve_cmphi:
1504 case Intrinsic::aarch64_sve_cmphi_wide:
1505 case Intrinsic::aarch64_sve_cmphs:
1506 case Intrinsic::aarch64_sve_cmphs_wide:
1507 case Intrinsic::aarch64_sve_cmple_wide:
1508 case Intrinsic::aarch64_sve_cmplo_wide:
1509 case Intrinsic::aarch64_sve_cmpls_wide:
1510 case Intrinsic::aarch64_sve_cmplt_wide:
1511 case Intrinsic::aarch64_sve_cmpne:
1512 case Intrinsic::aarch64_sve_cmpne_wide:
1513 case Intrinsic::aarch64_sve_facge:
1514 case Intrinsic::aarch64_sve_facgt:
1515 case Intrinsic::aarch64_sve_fcmpeq:
1516 case Intrinsic::aarch64_sve_fcmpge:
1517 case Intrinsic::aarch64_sve_fcmpgt:
1518 case Intrinsic::aarch64_sve_fcmpne:
1519 case Intrinsic::aarch64_sve_fcmpuo:
1520 case Intrinsic::aarch64_sve_ld1:
1521 case Intrinsic::aarch64_sve_ld1_gather:
1522 case Intrinsic::aarch64_sve_ld1_gather_index:
1523 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1524 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1525 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1526 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1527 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1528 case Intrinsic::aarch64_sve_ld1q_gather_index:
1529 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1530 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1531 case Intrinsic::aarch64_sve_ld1ro:
1532 case Intrinsic::aarch64_sve_ld1rq:
1533 case Intrinsic::aarch64_sve_ld1udq:
1534 case Intrinsic::aarch64_sve_ld1uwq:
1535 case Intrinsic::aarch64_sve_ld2_sret:
1536 case Intrinsic::aarch64_sve_ld2q_sret:
1537 case Intrinsic::aarch64_sve_ld3_sret:
1538 case Intrinsic::aarch64_sve_ld3q_sret:
1539 case Intrinsic::aarch64_sve_ld4_sret:
1540 case Intrinsic::aarch64_sve_ld4q_sret:
1541 case Intrinsic::aarch64_sve_ldff1:
1542 case Intrinsic::aarch64_sve_ldff1_gather:
1543 case Intrinsic::aarch64_sve_ldff1_gather_index:
1544 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1545 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1546 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1547 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1548 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1549 case Intrinsic::aarch64_sve_ldnf1:
1550 case Intrinsic::aarch64_sve_ldnt1:
1551 case Intrinsic::aarch64_sve_ldnt1_gather:
1552 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1553 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1554 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1556
1557 case Intrinsic::aarch64_sve_prf:
1558 case Intrinsic::aarch64_sve_prfb_gather_index:
1559 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1560 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1561 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1562 case Intrinsic::aarch64_sve_prfd_gather_index:
1563 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1564 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1565 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1566 case Intrinsic::aarch64_sve_prfh_gather_index:
1567 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1568 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1569 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1570 case Intrinsic::aarch64_sve_prfw_gather_index:
1571 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1572 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1573 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1575
1576 case Intrinsic::aarch64_sve_st1_scatter:
1577 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1578 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1579 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1580 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1581 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1582 case Intrinsic::aarch64_sve_st1dq:
1583 case Intrinsic::aarch64_sve_st1q_scatter_index:
1584 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1585 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1586 case Intrinsic::aarch64_sve_st1wq:
1587 case Intrinsic::aarch64_sve_stnt1:
1588 case Intrinsic::aarch64_sve_stnt1_scatter:
1589 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1590 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1591 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1593 case Intrinsic::aarch64_sve_st2:
1594 case Intrinsic::aarch64_sve_st2q:
1596 case Intrinsic::aarch64_sve_st3:
1597 case Intrinsic::aarch64_sve_st3q:
1599 case Intrinsic::aarch64_sve_st4:
1600 case Intrinsic::aarch64_sve_st4q:
1602 }
1603
1604 return SVEIntrinsicInfo();
1605}
1606
1607static bool isAllActivePredicate(Value *Pred) {
1608 Value *UncastedPred;
1609
1610 // Look through predicate casts that only remove lanes.
1611 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1612 m_Value(UncastedPred)))) {
1613 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1614 Pred = UncastedPred;
1615
1616 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1617 m_Value(UncastedPred))))
1618 // If the predicate has the same or fewer lanes than the uncasted predicate
1619 // then we know the casting has no effect.
1620 if (OrigPredTy->getMinNumElements() <=
1621 cast<ScalableVectorType>(UncastedPred->getType())
1622 ->getMinNumElements())
1623 Pred = UncastedPred;
1624 }
1625
1626 auto *C = dyn_cast<Constant>(Pred);
1627 return C && C->isAllOnesValue();
1628}
1629
1630// Simplify `V` by only considering the operations that affect active lanes.
1631// This function should only return existing Values or newly created Constants.
1632static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1633 auto *Dup = dyn_cast<IntrinsicInst>(V);
1634 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1635 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1636 return ConstantVector::getSplat(
1637 cast<VectorType>(V->getType())->getElementCount(),
1638 cast<Constant>(Dup->getOperand(2)));
1639
1640 return V;
1641}
1642
1643static std::optional<Instruction *>
1644 simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1645 const SVEIntrinsicInfo &IInfo) {
1646 const unsigned Opc = IInfo.getMatchingIROpode();
1647 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1648
1649 Value *Pg = II.getOperand(0);
1650 Value *Op1 = II.getOperand(1);
1651 Value *Op2 = II.getOperand(2);
1652 const DataLayout &DL = II.getDataLayout();
1653
1654 // Canonicalise constants to the RHS.
1656 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1657 IC.replaceOperand(II, 1, Op2);
1658 IC.replaceOperand(II, 2, Op1);
1659 return &II;
1660 }
1661
1662 // Only active lanes matter when simplifying the operation.
1663 Op1 = stripInactiveLanes(Op1, Pg);
1664 Op2 = stripInactiveLanes(Op2, Pg);
1665
1666 Value *SimpleII;
1667 if (auto FII = dyn_cast<FPMathOperator>(&II))
1668 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1669 else
1670 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1671
1672 // An SVE intrinsic's result is always defined. However, this is not the case
1673 // for its equivalent IR instruction (e.g. when shifting by an amount more
1674 // than the data's bitwidth). Simplifications to an undefined result must be
1675 // ignored to preserve the intrinsic's expected behaviour.
1676 if (!SimpleII || isa<UndefValue>(SimpleII))
1677 return std::nullopt;
1678
1679 if (IInfo.inactiveLanesAreNotDefined())
1680 return IC.replaceInstUsesWith(II, SimpleII);
1681
1682 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1683
1684 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1685 if (SimpleII == Inactive)
1686 return IC.replaceInstUsesWith(II, SimpleII);
1687
1688 // Inactive lanes must be preserved.
1689 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1690 return IC.replaceInstUsesWith(II, SimpleII);
1691}
1692
1693// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1694// to operations with less strict inactive lane requirements.
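// For example, when the governing predicate is known to be all inactive the
// call can be folded to the operand supplying the inactive lanes (or to zero,
// or erased entirely), and when it is all active a merging intrinsic can be
// rewritten to its "undef inactive lanes" (_u) counterpart.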
1695static std::optional<Instruction *>
1697 const SVEIntrinsicInfo &IInfo) {
1698 if (!IInfo.hasGoverningPredicate())
1699 return std::nullopt;
1700
1701 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1702
1703 // If there are no active lanes.
1704 if (match(OpPredicate, m_ZeroInt())) {
1706 return IC.replaceInstUsesWith(
1707 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1708
1709 if (IInfo.inactiveLanesAreUnused()) {
1710 if (IInfo.resultIsZeroInitialized())
1711 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1712
1713 return IC.eraseInstFromFunction(II);
1714 }
1715 }
1716
1717 // If there are no inactive lanes.
1718 if (isAllActivePredicate(OpPredicate)) {
1719 if (IInfo.hasOperandWithNoActiveLanes()) {
1720 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1721 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1722 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1723 }
1724
1725 if (IInfo.hasMatchingUndefIntrinsic()) {
1726 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1727 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1728 II.setCalledFunction(NewDecl);
1729 return &II;
1730 }
1731 }
1732
1733 // Operation specific simplifications.
1734 if (IInfo.hasMatchingIROpode() &&
1735 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1736 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1737
1738 return std::nullopt;
1739}
1740
1741// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1742// => (binop (pred) (from_svbool _) (from_svbool _))
1743//
1744// The above transformation eliminates a `to_svbool` in the predicate
1745// operand of bitwise operation `binop` by narrowing the vector width of
1746// the operation. For example, it would convert a `<vscale x 16 x i1>
1747// and` into a `<vscale x 4 x i1> and`. This is profitable because
1748// to_svbool must zero the new lanes during widening, whereas
1749// from_svbool is free.
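// Roughly, for a <vscale x 4 x i1> result the chain
//   %wide = to_svbool(%pg)         ; <vscale x 4 x i1> -> <vscale x 16 x i1>
//   %op   = and_z(%wide, %a, %b)   ; operates on <vscale x 16 x i1>
//   %res  = from_svbool(%op)       ; -> <vscale x 4 x i1>
// becomes an and_z computed directly on <vscale x 4 x i1> values, with %a and
// %b narrowed through (free) from_svbool calls.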
1750static std::optional<Instruction *>
1751 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1752 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1753 if (!BinOp)
1754 return std::nullopt;
1755
1756 auto IntrinsicID = BinOp->getIntrinsicID();
1757 switch (IntrinsicID) {
1758 case Intrinsic::aarch64_sve_and_z:
1759 case Intrinsic::aarch64_sve_bic_z:
1760 case Intrinsic::aarch64_sve_eor_z:
1761 case Intrinsic::aarch64_sve_nand_z:
1762 case Intrinsic::aarch64_sve_nor_z:
1763 case Intrinsic::aarch64_sve_orn_z:
1764 case Intrinsic::aarch64_sve_orr_z:
1765 break;
1766 default:
1767 return std::nullopt;
1768 }
1769
1770 auto BinOpPred = BinOp->getOperand(0);
1771 auto BinOpOp1 = BinOp->getOperand(1);
1772 auto BinOpOp2 = BinOp->getOperand(2);
1773
1774 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1775 if (!PredIntr ||
1776 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1777 return std::nullopt;
1778
1779 auto PredOp = PredIntr->getOperand(0);
1780 auto PredOpTy = cast<VectorType>(PredOp->getType());
1781 if (PredOpTy != II.getType())
1782 return std::nullopt;
1783
1784 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1785 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1786 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1787 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1788 if (BinOpOp1 == BinOpOp2)
1789 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1790 else
1791 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1792 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1793
1794 auto NarrowedBinOp =
1795 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1796 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1797}
1798
1799static std::optional<Instruction *>
1800 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1801 // If the reinterpret instruction operand is a PHI Node
1802 if (isa<PHINode>(II.getArgOperand(0)))
1803 return processPhiNode(IC, II);
1804
1805 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1806 return BinOpCombine;
1807
1808 // Ignore converts to/from svcount_t.
1809 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1810 isa<TargetExtType>(II.getType()))
1811 return std::nullopt;
1812
1813 SmallVector<Instruction *, 32> CandidatesForRemoval;
1814 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1815
1816 const auto *IVTy = cast<VectorType>(II.getType());
1817
1818 // Walk the chain of conversions.
1819 while (Cursor) {
1820 // If the type of the cursor has fewer lanes than the final result, zeroing
1821 // must take place, which breaks the equivalence chain.
1822 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1823 if (CursorVTy->getElementCount().getKnownMinValue() <
1824 IVTy->getElementCount().getKnownMinValue())
1825 break;
1826
1827 // If the cursor has the same type as I, it is a viable replacement.
1828 if (Cursor->getType() == IVTy)
1829 EarliestReplacement = Cursor;
1830
1831 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1832
1833 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1834 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1835 Intrinsic::aarch64_sve_convert_to_svbool ||
1836 IntrinsicCursor->getIntrinsicID() ==
1837 Intrinsic::aarch64_sve_convert_from_svbool))
1838 break;
1839
1840 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1841 Cursor = IntrinsicCursor->getOperand(0);
1842 }
1843
1844 // If no viable replacement in the conversion chain was found, there is
1845 // nothing to do.
1846 if (!EarliestReplacement)
1847 return std::nullopt;
1848
1849 return IC.replaceInstUsesWith(II, EarliestReplacement);
1850}
1851
1852static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1853 IntrinsicInst &II) {
1854 // svsel(ptrue, x, y) => x
1855 auto *OpPredicate = II.getOperand(0);
1856 if (isAllActivePredicate(OpPredicate))
1857 return IC.replaceInstUsesWith(II, II.getOperand(1));
1858
1859 auto Select =
1860 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1861 return IC.replaceInstUsesWith(II, Select);
1862}
1863
1864static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1865 IntrinsicInst &II) {
1866 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1867 if (!Pg)
1868 return std::nullopt;
1869
1870 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1871 return std::nullopt;
1872
1873 const auto PTruePattern =
1874 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1875 if (PTruePattern != AArch64SVEPredPattern::vl1)
1876 return std::nullopt;
1877
1878 // The intrinsic is inserting into lane zero so use an insert instead.
1879 auto *IdxTy = Type::getInt64Ty(II.getContext());
1880 auto *Insert = InsertElementInst::Create(
1881 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1882 Insert->insertBefore(II.getIterator());
1883 Insert->takeName(&II);
1884
1885 return IC.replaceInstUsesWith(II, Insert);
1886}
1887
1888static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1889 IntrinsicInst &II) {
1890 // Replace DupX with a regular IR splat.
1891 auto *RetTy = cast<ScalableVectorType>(II.getType());
1892 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1893 II.getArgOperand(0));
1894 Splat->takeName(&II);
1895 return IC.replaceInstUsesWith(II, Splat);
1896}
1897
1898static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1899 IntrinsicInst &II) {
1900 LLVMContext &Ctx = II.getContext();
1901
1902 if (!isAllActivePredicate(II.getArgOperand(0)))
1903 return std::nullopt;
1904
1905 // Check that we have a compare of zero..
1906 auto *SplatValue =
1907 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1908 if (!SplatValue || !SplatValue->isZero())
1909 return std::nullopt;
1910
1911 // ..against a dupq
1912 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1913 if (!DupQLane ||
1914 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1915 return std::nullopt;
1916
1917 // Where the dupq is a lane 0 replicate of a vector insert
1918 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1919 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1920 return std::nullopt;
1921
1922 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1923 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1924 return std::nullopt;
1925
1926 // Where the vector insert is a fixed constant vector insert into undef at
1927 // index zero
1928 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1929 return std::nullopt;
1930
1931 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1932 return std::nullopt;
1933
1934 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1935 if (!ConstVec)
1936 return std::nullopt;
1937
1938 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1939 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1940 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1941 return std::nullopt;
1942
1943 unsigned NumElts = VecTy->getNumElements();
1944 unsigned PredicateBits = 0;
1945
1946 // Expand intrinsic operands to a 16-bit byte level predicate
1947 for (unsigned I = 0; I < NumElts; ++I) {
1948 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1949 if (!Arg)
1950 return std::nullopt;
1951 if (!Arg->isZero())
1952 PredicateBits |= 1 << (I * (16 / NumElts));
1953 }
1954
1955 // If all bits are zero bail early with an empty predicate
1956 if (PredicateBits == 0) {
1957 auto *PFalse = Constant::getNullValue(II.getType());
1958 PFalse->takeName(&II);
1959 return IC.replaceInstUsesWith(II, PFalse);
1960 }
1961
1962 // Calculate largest predicate type used (where byte predicate is largest)
1963 unsigned Mask = 8;
1964 for (unsigned I = 0; I < 16; ++I)
1965 if ((PredicateBits & (1 << I)) != 0)
1966 Mask |= (I % 8);
1967
1968 unsigned PredSize = Mask & -Mask;
1969 auto *PredType = ScalableVectorType::get(
1970 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1971
1972 // Ensure all relevant bits are set
1973 for (unsigned I = 0; I < 16; I += PredSize)
1974 if ((PredicateBits & (1 << I)) == 0)
1975 return std::nullopt;
1976
1977 auto *PTruePat =
1978 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1979 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1980 {PredType}, {PTruePat});
1981 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1982 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1983 auto *ConvertFromSVBool =
1984 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1985 {II.getType()}, {ConvertToSVBool});
1986
1987 ConvertFromSVBool->takeName(&II);
1988 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1989}
1990
1991static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1992 IntrinsicInst &II) {
1993 Value *Pg = II.getArgOperand(0);
1994 Value *Vec = II.getArgOperand(1);
1995 auto IntrinsicID = II.getIntrinsicID();
1996 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1997
1998 // lastX(splat(X)) --> X
1999 if (auto *SplatVal = getSplatValue(Vec))
2000 return IC.replaceInstUsesWith(II, SplatVal);
2001
2002 // If x and/or y is a splat value then:
2003 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2004 Value *LHS, *RHS;
2005 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2006 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2007 auto *OldBinOp = cast<BinaryOperator>(Vec);
2008 auto OpC = OldBinOp->getOpcode();
2009 auto *NewLHS =
2010 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2011 auto *NewRHS =
2012 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2013 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2014 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2015 return IC.replaceInstUsesWith(II, NewBinOp);
2016 }
2017 }
2018
2019 auto *C = dyn_cast<Constant>(Pg);
2020 if (IsAfter && C && C->isNullValue()) {
2021 // The intrinsic is extracting lane 0 so use an extract instead.
2022 auto *IdxTy = Type::getInt64Ty(II.getContext());
2023 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2024 Extract->insertBefore(II.getIterator());
2025 Extract->takeName(&II);
2026 return IC.replaceInstUsesWith(II, Extract);
2027 }
2028
2029 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2030 if (!IntrPG)
2031 return std::nullopt;
2032
2033 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2034 return std::nullopt;
2035
2036 const auto PTruePattern =
2037 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2038
2039 // Can the intrinsic's predicate be converted to a known constant index?
2040 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2041 if (!MinNumElts)
2042 return std::nullopt;
2043
2044 unsigned Idx = MinNumElts - 1;
2045 // Increment the index if extracting the element after the last active
2046 // predicate element.
2047 if (IsAfter)
2048 ++Idx;
2049
2050 // Ignore extracts whose index is larger than the known minimum vector
2051 // length. NOTE: This is an artificial constraint where we prefer to
2052 // maintain what the user asked for until an alternative is proven faster.
2053 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2054 if (Idx >= PgVTy->getMinNumElements())
2055 return std::nullopt;
2056
2057 // The intrinsic is extracting a fixed lane so use an extract instead.
2058 auto *IdxTy = Type::getInt64Ty(II.getContext());
2059 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2060 Extract->insertBefore(II.getIterator());
2061 Extract->takeName(&II);
2062 return IC.replaceInstUsesWith(II, Extract);
2063}
2064
2065static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2066 IntrinsicInst &II) {
2067 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2068 // integer variant across a variety of micro-architectures. Replace scalar
2069 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2070 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2071 // depending on the micro-architecture, but has been observed as generally
2072 // being faster, particularly when the CLAST[AB] op is a loop-carried
2073 // dependency.
2074 Value *Pg = II.getArgOperand(0);
2075 Value *Fallback = II.getArgOperand(1);
2076 Value *Vec = II.getArgOperand(2);
2077 Type *Ty = II.getType();
2078
2079 if (!Ty->isIntegerTy())
2080 return std::nullopt;
2081
2082 Type *FPTy;
2083 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2084 default:
2085 return std::nullopt;
2086 case 16:
2087 FPTy = IC.Builder.getHalfTy();
2088 break;
2089 case 32:
2090 FPTy = IC.Builder.getFloatTy();
2091 break;
2092 case 64:
2093 FPTy = IC.Builder.getDoubleTy();
2094 break;
2095 }
2096
2097 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2098 auto *FPVTy = VectorType::get(
2099 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2100 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2101 auto *FPII = IC.Builder.CreateIntrinsic(
2102 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2103 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2104 return IC.replaceInstUsesWith(II, FPIItoInt);
2105}
2106
2107static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2108 IntrinsicInst &II) {
2109 LLVMContext &Ctx = II.getContext();
2110 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2111 // can work with RDFFR_PP for ptest elimination.
2112 auto *AllPat =
2113 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2114 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2115 {II.getType()}, {AllPat});
2116 auto *RDFFR =
2117 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2118 RDFFR->takeName(&II);
2119 return IC.replaceInstUsesWith(II, RDFFR);
2120}
2121
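// Fold the SVE element-count intrinsics (cntb/cnth/cntw/cntd). With the "all"
// pattern the result is the full element count of the corresponding vector
// type; with an explicit vlN pattern that is covered by the known minimum
// vector length the call folds to the constant N.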
2122static std::optional<Instruction *>
2124 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2125
2126 if (Pattern == AArch64SVEPredPattern::all) {
2127 Value *Cnt = IC.Builder.CreateElementCount(
2128 II.getType(), ElementCount::getScalable(NumElts));
2129 Cnt->takeName(&II);
2130 return IC.replaceInstUsesWith(II, Cnt);
2131 }
2132
2133 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2134
2135 return MinNumElts && NumElts >= MinNumElts
2136 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2137 II, ConstantInt::get(II.getType(), MinNumElts)))
2138 : std::nullopt;
2139}
2140
2141static std::optional<Instruction *>
2142 instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2143 const AArch64Subtarget *ST) {
2144 if (!ST->isStreaming())
2145 return std::nullopt;
2146
2147 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2148 // with SVEPredPattern::all
2149 Value *Cnt =
2151 Cnt->takeName(&II);
2152 return IC.replaceInstUsesWith(II, Cnt);
2153}
2154
2155static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2156 IntrinsicInst &II) {
2157 Value *PgVal = II.getArgOperand(0);
2158 Value *OpVal = II.getArgOperand(1);
2159
2160 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2161 // Later optimizations prefer this form.
2162 if (PgVal == OpVal &&
2163 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2164 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2165 Value *Ops[] = {PgVal, OpVal};
2166 Type *Tys[] = {PgVal->getType()};
2167
2168 auto *PTest =
2169 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2170 PTest->takeName(&II);
2171
2172 return IC.replaceInstUsesWith(II, PTest);
2173 }
2174
2175 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2176 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2177
2178 if (!Pg || !Op)
2179 return std::nullopt;
2180
2181 Intrinsic::ID OpIID = Op->getIntrinsicID();
2182
2183 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2184 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2185 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2186 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2187 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2188
2189 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2190
2191 PTest->takeName(&II);
2192 return IC.replaceInstUsesWith(II, PTest);
2193 }
2194
2195 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2196 // Later optimizations may rewrite sequence to use the flag-setting variant
2197 // of instruction X to remove PTEST.
2198 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2199 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2200 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2201 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2202 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2203 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2204 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2205 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2206 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2207 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2208 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2209 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2210 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2211 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2212 Type *Tys[] = {Pg->getType()};
2213
2214 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2215 PTest->takeName(&II);
2216
2217 return IC.replaceInstUsesWith(II, PTest);
2218 }
2219
2220 return std::nullopt;
2221}
2222
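// Fold a multiply intrinsic (MulOpc) feeding a predicated add or subtract into
// the corresponding fused multiply-accumulate intrinsic (FuseOpc), e.g.
// fmul+fadd -> fmla. The multiply must have a single use and, for floating
// point, the fast-math flags of both calls must match and allow contraction.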
2223template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
2224static std::optional<Instruction *>
2226 bool MergeIntoAddendOp) {
2227 Value *P = II.getOperand(0);
2228 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2229 if (MergeIntoAddendOp) {
2230 AddendOp = II.getOperand(1);
2231 Mul = II.getOperand(2);
2232 } else {
2233 AddendOp = II.getOperand(2);
2234 Mul = II.getOperand(1);
2235 }
2236
2237 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2238 m_Value(MulOp1))))
2239 return std::nullopt;
2240
2241 if (!Mul->hasOneUse())
2242 return std::nullopt;
2243
2244 Instruction *FMFSource = nullptr;
2245 if (II.getType()->isFPOrFPVectorTy()) {
2246 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2247 // Stop the combine when the flags on the inputs differ in case dropping
2248 // flags would lead to us missing out on more beneficial optimizations.
2249 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2250 return std::nullopt;
2251 if (!FAddFlags.allowContract())
2252 return std::nullopt;
2253 FMFSource = &II;
2254 }
2255
2256 CallInst *Res;
2257 if (MergeIntoAddendOp)
2258 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2259 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2260 else
2261 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2262 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2263
2264 return IC.replaceInstUsesWith(II, Res);
2265}
2266
2267static std::optional<Instruction *>
2269 Value *Pred = II.getOperand(0);
2270 Value *PtrOp = II.getOperand(1);
2271 Type *VecTy = II.getType();
2272
2273 if (isAllActivePredicate(Pred)) {
2274 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2275 Load->copyMetadata(II);
2276 return IC.replaceInstUsesWith(II, Load);
2277 }
2278
2279 CallInst *MaskedLoad =
2280 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2281 Pred, ConstantAggregateZero::get(VecTy));
2282 MaskedLoad->copyMetadata(II);
2283 return IC.replaceInstUsesWith(II, MaskedLoad);
2284}
2285
2286static std::optional<Instruction *>
2288 Value *VecOp = II.getOperand(0);
2289 Value *Pred = II.getOperand(1);
2290 Value *PtrOp = II.getOperand(2);
2291
2292 if (isAllActivePredicate(Pred)) {
2293 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2294 Store->copyMetadata(II);
2295 return IC.eraseInstFromFunction(II);
2296 }
2297
2298 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2299 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2300 MaskedStore->copyMetadata(II);
2301 return IC.eraseInstFromFunction(II);
2302}
2303
2304 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2305 switch (Intrinsic) {
2306 case Intrinsic::aarch64_sve_fmul_u:
2307 return Instruction::BinaryOps::FMul;
2308 case Intrinsic::aarch64_sve_fadd_u:
2309 return Instruction::BinaryOps::FAdd;
2310 case Intrinsic::aarch64_sve_fsub_u:
2311 return Instruction::BinaryOps::FSub;
2312 default:
2313 return Instruction::BinaryOpsEnd;
2314 }
2315}
2316
2317static std::optional<Instruction *>
2319 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2320 if (II.isStrictFP())
2321 return std::nullopt;
2322
2323 auto *OpPredicate = II.getOperand(0);
2324 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2325 if (BinOpCode == Instruction::BinaryOpsEnd ||
2326 !isAllActivePredicate(OpPredicate))
2327 return std::nullopt;
2328 auto BinOp = IC.Builder.CreateBinOpFMF(
2329 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2330 return IC.replaceInstUsesWith(II, BinOp);
2331}
2332
2333static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2334 IntrinsicInst &II) {
2335 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2336 Intrinsic::aarch64_sve_mla>(
2337 IC, II, true))
2338 return MLA;
2339 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2340 Intrinsic::aarch64_sve_mad>(
2341 IC, II, false))
2342 return MAD;
2343 return std::nullopt;
2344}
2345
2346static std::optional<Instruction *>
2348 if (auto FMLA =
2349 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2350 Intrinsic::aarch64_sve_fmla>(IC, II,
2351 true))
2352 return FMLA;
2353 if (auto FMAD =
2354 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2355 Intrinsic::aarch64_sve_fmad>(IC, II,
2356 false))
2357 return FMAD;
2358 if (auto FMLA =
2359 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2360 Intrinsic::aarch64_sve_fmla>(IC, II,
2361 true))
2362 return FMLA;
2363 return std::nullopt;
2364}
2365
2366static std::optional<Instruction *>
2368 if (auto FMLA =
2369 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2370 Intrinsic::aarch64_sve_fmla>(IC, II,
2371 true))
2372 return FMLA;
2373 if (auto FMAD =
2374 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2375 Intrinsic::aarch64_sve_fmad>(IC, II,
2376 false))
2377 return FMAD;
2378 if (auto FMLA_U =
2379 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2380 Intrinsic::aarch64_sve_fmla_u>(
2381 IC, II, true))
2382 return FMLA_U;
2383 return instCombineSVEVectorBinOp(IC, II);
2384}
2385
2386static std::optional<Instruction *>
2388 if (auto FMLS =
2389 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2390 Intrinsic::aarch64_sve_fmls>(IC, II,
2391 true))
2392 return FMLS;
2393 if (auto FMSB =
2394 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2395 Intrinsic::aarch64_sve_fnmsb>(
2396 IC, II, false))
2397 return FMSB;
2398 if (auto FMLS =
2399 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2400 Intrinsic::aarch64_sve_fmls>(IC, II,
2401 true))
2402 return FMLS;
2403 return std::nullopt;
2404}
2405
2406static std::optional<Instruction *>
2408 if (auto FMLS =
2409 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2410 Intrinsic::aarch64_sve_fmls>(IC, II,
2411 true))
2412 return FMLS;
2413 if (auto FMSB =
2414 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2415 Intrinsic::aarch64_sve_fnmsb>(
2416 IC, II, false))
2417 return FMSB;
2418 if (auto FMLS_U =
2419 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2420 Intrinsic::aarch64_sve_fmls_u>(
2421 IC, II, true))
2422 return FMLS_U;
2423 return instCombineSVEVectorBinOp(IC, II);
2424}
2425
2426static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2427 IntrinsicInst &II) {
2428 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2429 Intrinsic::aarch64_sve_mls>(
2430 IC, II, true))
2431 return MLS;
2432 return std::nullopt;
2433}
2434
2435static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2436 IntrinsicInst &II) {
2437 Value *UnpackArg = II.getArgOperand(0);
2438 auto *RetTy = cast<ScalableVectorType>(II.getType());
2439 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2440 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2441
2442 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2443 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2444 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2445 ScalarArg =
2446 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2447 Value *NewVal =
2448 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2449 NewVal->takeName(&II);
2450 return IC.replaceInstUsesWith(II, NewVal);
2451 }
2452
2453 return std::nullopt;
2454}
2455static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2456 IntrinsicInst &II) {
2457 auto *OpVal = II.getOperand(0);
2458 auto *OpIndices = II.getOperand(1);
2459 VectorType *VTy = cast<VectorType>(II.getType());
2460
2461 // Check whether OpIndices is a constant splat value < minimal element count
2462 // of result.
2463 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2464 if (!SplatValue ||
2465 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2466 return std::nullopt;
2467
2468 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2469 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2470 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2471 auto *VectorSplat =
2472 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2473
2474 VectorSplat->takeName(&II);
2475 return IC.replaceInstUsesWith(II, VectorSplat);
2476}
2477
2478static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2479 IntrinsicInst &II) {
2480 Value *A, *B;
2481 Type *RetTy = II.getType();
2482 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2483 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2484
2485 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2486 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2487 if ((match(II.getArgOperand(0),
2488 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2489 match(II.getArgOperand(1),
2490 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2491 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2492 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2493 auto *TyA = cast<ScalableVectorType>(A->getType());
2494 if (TyA == B->getType() &&
2495 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2496 auto *SubVec = IC.Builder.CreateInsertVector(
2497 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2498 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2499 TyA->getMinNumElements());
2500 ConcatVec->takeName(&II);
2501 return IC.replaceInstUsesWith(II, ConcatVec);
2502 }
2503 }
2504
2505 return std::nullopt;
2506}
2507
2508static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2509 IntrinsicInst &II) {
2510 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2511 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2512 Value *A, *B;
2513 if (match(II.getArgOperand(0),
2514 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2515 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2516 m_Specific(A), m_Specific(B))))
2517 return IC.replaceInstUsesWith(
2518 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2519
2520 return std::nullopt;
2521}
2522
2523static std::optional<Instruction *>
2525 Value *Mask = II.getOperand(0);
2526 Value *BasePtr = II.getOperand(1);
2527 Value *Index = II.getOperand(2);
2528 Type *Ty = II.getType();
2529 Value *PassThru = ConstantAggregateZero::get(Ty);
2530
2531 // Contiguous gather => masked load.
2532 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2533 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2534 Value *IndexBase;
2535 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2536 m_Value(IndexBase), m_SpecificInt(1)))) {
2537 Align Alignment =
2538 BasePtr->getPointerAlignment(II.getDataLayout());
2539
2540 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2541 BasePtr, IndexBase);
2542 CallInst *MaskedLoad =
2543 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2544 MaskedLoad->takeName(&II);
2545 return IC.replaceInstUsesWith(II, MaskedLoad);
2546 }
2547
2548 return std::nullopt;
2549}
2550
2551static std::optional<Instruction *>
2553 Value *Val = II.getOperand(0);
2554 Value *Mask = II.getOperand(1);
2555 Value *BasePtr = II.getOperand(2);
2556 Value *Index = II.getOperand(3);
2557 Type *Ty = Val->getType();
2558
2559 // Contiguous scatter => masked store.
2560 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2561 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2562 Value *IndexBase;
2563 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2564 m_Value(IndexBase), m_SpecificInt(1)))) {
2565 Align Alignment =
2566 BasePtr->getPointerAlignment(II.getDataLayout());
2567
2568 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2569 BasePtr, IndexBase);
2570 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2571
2572 return IC.eraseInstFromFunction(II);
2573 }
2574
2575 return std::nullopt;
2576}
2577
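// Fold a predicated sdiv by a constant power-of-two splat into an asrd
// (arithmetic shift right for divide), e.g. sdiv(pg, x, splat(8)) becomes
// asrd(pg, x, #3); for a negative power of two the shifted result is negated.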
2578static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2579 IntrinsicInst &II) {
2580 Type *Int32Ty = IC.Builder.getInt32Ty();
2581 Value *Pred = II.getOperand(0);
2582 Value *Vec = II.getOperand(1);
2583 Value *DivVec = II.getOperand(2);
2584
2585 Value *SplatValue = getSplatValue(DivVec);
2586 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2587 if (!SplatConstantInt)
2588 return std::nullopt;
2589
2590 APInt Divisor = SplatConstantInt->getValue();
2591 const int64_t DivisorValue = Divisor.getSExtValue();
2592 if (DivisorValue == -1)
2593 return std::nullopt;
2594 if (DivisorValue == 1)
2595 IC.replaceInstUsesWith(II, Vec);
2596
2597 if (Divisor.isPowerOf2()) {
2598 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2599 auto ASRD = IC.Builder.CreateIntrinsic(
2600 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2601 return IC.replaceInstUsesWith(II, ASRD);
2602 }
2603 if (Divisor.isNegatedPowerOf2()) {
2604 Divisor.negate();
2605 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2606 auto ASRD = IC.Builder.CreateIntrinsic(
2607 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2608 auto NEG = IC.Builder.CreateIntrinsic(
2609 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2610 return IC.replaceInstUsesWith(II, NEG);
2611 }
2612
2613 return std::nullopt;
2614}
2615
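// Returns true if Vec can be shrunk to a repeating prefix: the second half of
// Vec must repeat the first half, with nullptr entries treated as poison
// wildcards when AllowPoison is set. E.g. (a, b, a, b) simplifies to (a, b).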
2616bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2617 size_t VecSize = Vec.size();
2618 if (VecSize == 1)
2619 return true;
2620 if (!isPowerOf2_64(VecSize))
2621 return false;
2622 size_t HalfVecSize = VecSize / 2;
2623
2624 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2625 RHS != Vec.end(); LHS++, RHS++) {
2626 if (*LHS != nullptr && *RHS != nullptr) {
2627 if (*LHS == *RHS)
2628 continue;
2629 else
2630 return false;
2631 }
2632 if (!AllowPoison)
2633 return false;
2634 if (*LHS == nullptr && *RHS != nullptr)
2635 *LHS = *RHS;
2636 }
2637
2638 Vec.resize(HalfVecSize);
2639 SimplifyValuePattern(Vec, AllowPoison);
2640 return true;
2641}
2642
2643// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2644// to dupqlane(f64(C)) where C is A concatenated with B
2645static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2646 IntrinsicInst &II) {
2647 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2648 if (!match(II.getOperand(0),
2649 m_Intrinsic<Intrinsic::vector_insert>(
2650 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2651 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2652 return std::nullopt;
2653 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2654
2655 // Insert the scalars into a container ordered by InsertElement index
2656 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2657 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2658 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2659 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2660 CurrentInsertElt = InsertElt->getOperand(0);
2661 }
2662
2663 bool AllowPoison =
2664 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2665 if (!SimplifyValuePattern(Elts, AllowPoison))
2666 return std::nullopt;
2667
2668 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2669 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2670 for (size_t I = 0; I < Elts.size(); I++) {
2671 if (Elts[I] == nullptr)
2672 continue;
2673 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2674 IC.Builder.getInt64(I));
2675 }
2676 if (InsertEltChain == nullptr)
2677 return std::nullopt;
2678
2679 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2680 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2681 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2682 // be narrowed back to the original type.
2683 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2684 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2685 IIScalableTy->getMinNumElements() /
2686 PatternWidth;
2687
2688 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2689 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2690 auto *WideShuffleMaskTy =
2691 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2692
2693 auto InsertSubvector = IC.Builder.CreateInsertVector(
2694 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2695 uint64_t(0));
2696 auto WideBitcast =
2697 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2698 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2699 auto WideShuffle = IC.Builder.CreateShuffleVector(
2700 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2701 auto NarrowBitcast =
2702 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2703
2704 return IC.replaceInstUsesWith(II, NarrowBitcast);
2705}
2706
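// fmaxnm(A, A) and fminnm(A, A) are simply A, so forward the operand.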
2707static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2708 IntrinsicInst &II) {
2709 Value *A = II.getArgOperand(0);
2710 Value *B = II.getArgOperand(1);
2711 if (A == B)
2712 return IC.replaceInstUsesWith(II, A);
2713
2714 return std::nullopt;
2715}
2716
2717static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2718 IntrinsicInst &II) {
2719 Value *Pred = II.getOperand(0);
2720 Value *Vec = II.getOperand(1);
2721 Value *Shift = II.getOperand(2);
2722
2723 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2724 Value *AbsPred, *MergedValue;
2725 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2726 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2727 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2728 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2729
2730 return std::nullopt;
2731
2732 // Transform is valid if any of the following are true:
2733 // * The ABS merge value is an undef or non-negative
2734 // * The ABS predicate is all active
2735 // * The ABS predicate and the SRSHL predicates are the same
2736 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2737 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2738 return std::nullopt;
2739
2740 // Only valid when the shift amount is non-negative, otherwise the rounding
2741 // behaviour of SRSHL cannot be ignored.
2742 if (!match(Shift, m_NonNegative()))
2743 return std::nullopt;
2744
2745 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2746 {II.getType()}, {Pred, Vec, Shift});
2747
2748 return IC.replaceInstUsesWith(II, LSL);
2749}
2750
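// insr(splat(X), X) shifts in the value every lane already holds, so the
// result is just the original splat.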
2751static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2752 IntrinsicInst &II) {
2753 Value *Vec = II.getOperand(0);
2754
2755 if (getSplatValue(Vec) == II.getOperand(1))
2756 return IC.replaceInstUsesWith(II, Vec);
2757
2758 return std::nullopt;
2759}
2760
2761static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2762 IntrinsicInst &II) {
2763 // If this barrier is post-dominated by an identical one, we can remove it.
2764 auto *NI = II.getNextNode();
2765 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2766 auto CanSkipOver = [](Instruction *I) {
2767 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2768 };
2769 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2770 auto *NIBB = NI->getParent();
2771 NI = NI->getNextNode();
2772 if (!NI) {
2773 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2774 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2775 else
2776 break;
2777 }
2778 }
2779 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2780 if (NextII && II.isIdenticalTo(NextII))
2781 return IC.eraseInstFromFunction(II);
2782
2783 return std::nullopt;
2784}
2785
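// Replace aarch64_sve_whilelo with the equivalent target-independent
// llvm.get.active.lane.mask intrinsic, which generic optimisations understand.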
2786static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2787 IntrinsicInst &II) {
2788 return IC.replaceInstUsesWith(
2789 II,
2790 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2791 {II.getType(), II.getOperand(0)->getType()},
2792 {II.getOperand(0), II.getOperand(1)}));
2793}
2794
2795static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2796 IntrinsicInst &II) {
2797 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2798 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2799 return std::nullopt;
2800}
2801
2802static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2804 unsigned NumBits) {
2805 Value *Passthru = II.getOperand(0);
2806 Value *Pg = II.getOperand(1);
2807 Value *Op = II.getOperand(2);
2808
2809 // Convert UXT[BHW] to AND.
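// For example, uxtb zero-extends the low byte of each lane, which is the same
// as AND'ing each lane with 0xff; this is valid when the passthru is undef or
// the predicate is all active.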
2810 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2811 auto *Ty = cast<VectorType>(II.getType());
2812 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2813 auto *Mask = ConstantInt::get(Ty, MaskValue);
2814 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2815 {Pg, Op, Mask});
2816 return IC.replaceInstUsesWith(II, And);
2817 }
2818
2819 return std::nullopt;
2820}
2821
2822static std::optional<Instruction *>
2823 instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2824 SMEAttrs FnSMEAttrs(*II.getFunction());
2825 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2826 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2827 return IC.replaceInstUsesWith(
2828 II, ConstantInt::getBool(II.getType(), IsStreaming));
2829 return std::nullopt;
2830}
2831
2832std::optional<Instruction *>
2833 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2834 IntrinsicInst &II) const {
2836 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2837 return I;
2838
2839 Intrinsic::ID IID = II.getIntrinsicID();
2840 switch (IID) {
2841 default:
2842 break;
2843 case Intrinsic::aarch64_dmb:
2844 return instCombineDMB(IC, II);
2845 case Intrinsic::aarch64_neon_fmaxnm:
2846 case Intrinsic::aarch64_neon_fminnm:
2847 return instCombineMaxMinNM(IC, II);
2848 case Intrinsic::aarch64_sve_convert_from_svbool:
2849 return instCombineConvertFromSVBool(IC, II);
2850 case Intrinsic::aarch64_sve_dup:
2851 return instCombineSVEDup(IC, II);
2852 case Intrinsic::aarch64_sve_dup_x:
2853 return instCombineSVEDupX(IC, II);
2854 case Intrinsic::aarch64_sve_cmpne:
2855 case Intrinsic::aarch64_sve_cmpne_wide:
2856 return instCombineSVECmpNE(IC, II);
2857 case Intrinsic::aarch64_sve_rdffr:
2858 return instCombineRDFFR(IC, II);
2859 case Intrinsic::aarch64_sve_lasta:
2860 case Intrinsic::aarch64_sve_lastb:
2861 return instCombineSVELast(IC, II);
2862 case Intrinsic::aarch64_sve_clasta_n:
2863 case Intrinsic::aarch64_sve_clastb_n:
2864 return instCombineSVECondLast(IC, II);
2865 case Intrinsic::aarch64_sve_cntd:
2866 return instCombineSVECntElts(IC, II, 2);
2867 case Intrinsic::aarch64_sve_cntw:
2868 return instCombineSVECntElts(IC, II, 4);
2869 case Intrinsic::aarch64_sve_cnth:
2870 return instCombineSVECntElts(IC, II, 8);
2871 case Intrinsic::aarch64_sve_cntb:
2872 return instCombineSVECntElts(IC, II, 16);
2873 case Intrinsic::aarch64_sme_cntsd:
2874 return instCombineSMECntsd(IC, II, ST);
2875 case Intrinsic::aarch64_sve_ptest_any:
2876 case Intrinsic::aarch64_sve_ptest_first:
2877 case Intrinsic::aarch64_sve_ptest_last:
2878 return instCombineSVEPTest(IC, II);
2879 case Intrinsic::aarch64_sve_fadd:
2880 return instCombineSVEVectorFAdd(IC, II);
2881 case Intrinsic::aarch64_sve_fadd_u:
2882 return instCombineSVEVectorFAddU(IC, II);
2883 case Intrinsic::aarch64_sve_fmul_u:
2884 return instCombineSVEVectorBinOp(IC, II);
2885 case Intrinsic::aarch64_sve_fsub:
2886 return instCombineSVEVectorFSub(IC, II);
2887 case Intrinsic::aarch64_sve_fsub_u:
2888 return instCombineSVEVectorFSubU(IC, II);
2889 case Intrinsic::aarch64_sve_add:
2890 return instCombineSVEVectorAdd(IC, II);
2891 case Intrinsic::aarch64_sve_add_u:
2892 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2893 Intrinsic::aarch64_sve_mla_u>(
2894 IC, II, true);
2895 case Intrinsic::aarch64_sve_sub:
2896 return instCombineSVEVectorSub(IC, II);
2897 case Intrinsic::aarch64_sve_sub_u:
2898 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2899 Intrinsic::aarch64_sve_mls_u>(
2900 IC, II, true);
2901 case Intrinsic::aarch64_sve_tbl:
2902 return instCombineSVETBL(IC, II);
2903 case Intrinsic::aarch64_sve_uunpkhi:
2904 case Intrinsic::aarch64_sve_uunpklo:
2905 case Intrinsic::aarch64_sve_sunpkhi:
2906 case Intrinsic::aarch64_sve_sunpklo:
2907 return instCombineSVEUnpack(IC, II);
2908 case Intrinsic::aarch64_sve_uzp1:
2909 return instCombineSVEUzp1(IC, II);
2910 case Intrinsic::aarch64_sve_zip1:
2911 case Intrinsic::aarch64_sve_zip2:
2912 return instCombineSVEZip(IC, II);
2913 case Intrinsic::aarch64_sve_ld1_gather_index:
2914 return instCombineLD1GatherIndex(IC, II);
2915 case Intrinsic::aarch64_sve_st1_scatter_index:
2916 return instCombineST1ScatterIndex(IC, II);
2917 case Intrinsic::aarch64_sve_ld1:
2918 return instCombineSVELD1(IC, II, DL);
2919 case Intrinsic::aarch64_sve_st1:
2920 return instCombineSVEST1(IC, II, DL);
2921 case Intrinsic::aarch64_sve_sdiv:
2922 return instCombineSVESDIV(IC, II);
2923 case Intrinsic::aarch64_sve_sel:
2924 return instCombineSVESel(IC, II);
2925 case Intrinsic::aarch64_sve_srshl:
2926 return instCombineSVESrshl(IC, II);
2927 case Intrinsic::aarch64_sve_dupq_lane:
2928 return instCombineSVEDupqLane(IC, II);
2929 case Intrinsic::aarch64_sve_insr:
2930 return instCombineSVEInsr(IC, II);
2931 case Intrinsic::aarch64_sve_whilelo:
2932 return instCombineWhilelo(IC, II);
2933 case Intrinsic::aarch64_sve_ptrue:
2934 return instCombinePTrue(IC, II);
2935 case Intrinsic::aarch64_sve_uxtb:
2936 return instCombineSVEUxt(IC, II, 8);
2937 case Intrinsic::aarch64_sve_uxth:
2938 return instCombineSVEUxt(IC, II, 16);
2939 case Intrinsic::aarch64_sve_uxtw:
2940 return instCombineSVEUxt(IC, II, 32);
2941 case Intrinsic::aarch64_sme_in_streaming_mode:
2942 return instCombineInStreamingMode(IC, II);
2943 }
2944
2945 return std::nullopt;
2946}
2947
2948 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2949 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2950 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2951 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2952 SimplifyAndSetOp) const {
2953 switch (II.getIntrinsicID()) {
2954 default:
2955 break;
2956 case Intrinsic::aarch64_neon_fcvtxn:
2957 case Intrinsic::aarch64_neon_rshrn:
2958 case Intrinsic::aarch64_neon_sqrshrn:
2959 case Intrinsic::aarch64_neon_sqrshrun:
2960 case Intrinsic::aarch64_neon_sqshrn:
2961 case Intrinsic::aarch64_neon_sqshrun:
2962 case Intrinsic::aarch64_neon_sqxtn:
2963 case Intrinsic::aarch64_neon_sqxtun:
2964 case Intrinsic::aarch64_neon_uqrshrn:
2965 case Intrinsic::aarch64_neon_uqshrn:
2966 case Intrinsic::aarch64_neon_uqxtn:
2967 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2968 break;
2969 }
2970
2971 return std::nullopt;
2972}
2973
2975 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2977}
2978
2979 TypeSize
2980 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2981 switch (K) {
2982 case TargetTransformInfo::RGK_Scalar:
2983 return TypeSize::getFixed(64);
2984 case TargetTransformInfo::RGK_FixedWidthVector:
2985 if (ST->useSVEForFixedLengthVectors() &&
2986 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2987 return TypeSize::getFixed(
2988 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2989 else if (ST->isNeonAvailable())
2990 return TypeSize::getFixed(128);
2991 else
2992 return TypeSize::getFixed(0);
2993 case TargetTransformInfo::RGK_ScalableVector:
2994 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2996 return TypeSize::getScalable(128);
2997 else
2998 return TypeSize::getScalable(0);
2999 }
3000 llvm_unreachable("Unsupported register kind");
3001}
3002
3003bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
3005 Type *SrcOverrideTy) const {
3006 // A helper that returns a vector type from the given type. The number of
3007 // elements in type Ty determines the vector width.
3008 auto toVectorTy = [&](Type *ArgTy) {
3009 return VectorType::get(ArgTy->getScalarType(),
3010 cast<VectorType>(DstTy)->getElementCount());
3011 };
3012
3013 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3014 // i32, i64]. SVE doesn't generally have the same set of instructions to
3015 // perform an extend with the add/sub/mul. There are SMULLB style
3016 // instructions, but they operate on top/bottom, requiring some sort of lane
3017 // interleaving to be used with zext/sext.
3018 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3019 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3020 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3021 return false;
3022
3023 // Determine if the operation has a widening variant. We consider both the
3024 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
3025 // instructions.
3026 //
3027 // TODO: Add additional widening operations (e.g., shl, etc.) once we
3028 // verify that their extending operands are eliminated during code
3029 // generation.
3030 Type *SrcTy = SrcOverrideTy;
3031 switch (Opcode) {
3032 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
3033 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
3034 // The second operand needs to be an extend
3035 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3036 if (!SrcTy)
3037 SrcTy =
3038 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3039 } else
3040 return false;
3041 break;
3042 case Instruction::Mul: { // SMULL(2), UMULL(2)
3043 // Both operands need to be extends of the same type.
3044 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3045 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3046 if (!SrcTy)
3047 SrcTy =
3048 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3049 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
3050 // If one of the operands is a Zext and the other has enough zero bits to
3051 // be treated as unsigned, we can still generate a umull, meaning the zext
3052 // is free.
3053 KnownBits Known =
3054 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3055 if (Args[0]->getType()->getScalarSizeInBits() -
3056 Known.Zero.countLeadingOnes() >
3057 DstTy->getScalarSizeInBits() / 2)
3058 return false;
3059 if (!SrcTy)
3060 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
3061 DstTy->getScalarSizeInBits() / 2));
3062 } else
3063 return false;
3064 break;
3065 }
3066 default:
3067 return false;
3068 }
3069
3070 // Legalize the destination type and ensure it can be used in a widening
3071 // operation.
3072 auto DstTyL = getTypeLegalizationCost(DstTy);
3073 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3074 return false;
3075
3076 // Legalize the source type and ensure it can be used in a widening
3077 // operation.
3078 assert(SrcTy && "Expected some SrcTy");
3079 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3080 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3081 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3082 return false;
3083
3084 // Get the total number of vector elements in the legalized types.
3085 InstructionCost NumDstEls =
3086 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3087 InstructionCost NumSrcEls =
3088 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3089
3090 // Return true if the legalized types have the same number of vector elements
3091 // and the destination element type size is twice that of the source type.
3092 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3093}
3094
3095// s/urhadd instructions implement the following pattern, making the
3096// extends free:
3097// %x = add ((zext i8 -> i16), 1)
3098// %y = (zext i8 -> i16)
3099// trunc i16 (lshr (add %x, %y), 1) -> i8
3100//
3101 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3102 Type *Src) const {
3103 // The source should be a legal vector type.
3104 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3105 (Src->isScalableTy() && !ST->hasSVE2()))
3106 return false;
3107
3108 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3109 return false;
3110
3111 // Look for trunc/shl/add before trying to match the pattern.
3112 const Instruction *Add = ExtUser;
3113 auto *AddUser =
3114 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3115 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3116 Add = AddUser;
3117
3118 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3119 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3120 return false;
3121
3122 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3123 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3124 Src->getScalarSizeInBits() !=
3125 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3126 return false;
3127
3128 // Try to match the whole pattern. Ext could be either the first or second
3129 // m_ZExtOrSExt matched.
3130 Instruction *Ex1, *Ex2;
3131 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3132 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3133 return false;
3134
3135 // Ensure both extends are of the same type
3136 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3137 Ex1->getOpcode() == Ex2->getOpcode())
3138 return true;
3139
3140 return false;
3141}
3142
3143 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3144 Type *Src,
3145 TTI::CastContextHint CCH,
3146 TTI::TargetCostKind CostKind,
3147 const Instruction *I) const {
3148 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3149 assert(ISD && "Invalid opcode");
3150 // If the cast is observable, and it is used by a widening instruction (e.g.,
3151 // uaddl, saddw, etc.), it may be free.
3152 if (I && I->hasOneUser()) {
3153 auto *SingleUser = cast<Instruction>(*I->user_begin());
3154 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3155 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
3156 // For adds only count the second operand as free if both operands are
3157 // extends but not the same operation. (i.e. both operands are not free in
3158 // add(sext, zext)).
3159 if (SingleUser->getOpcode() == Instruction::Add) {
3160 if (I == SingleUser->getOperand(1) ||
3161 (isa<CastInst>(SingleUser->getOperand(1)) &&
3162 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3163 return 0;
3164 } else // Others are free so long as isWideningInstruction returned true.
3165 return 0;
3166 }
3167
3168 // The cast will be free for the s/urhadd instructions
3169 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3170 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3171 return 0;
3172 }
3173
3174 // TODO: Allow non-throughput costs that aren't binary.
3175 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3176 if (CostKind != TTI::TCK_RecipThroughput)
3177 return Cost == 0 ? 0 : 1;
3178 return Cost;
3179 };
3180
3181 EVT SrcTy = TLI->getValueType(DL, Src);
3182 EVT DstTy = TLI->getValueType(DL, Dst);
3183
3184 if (!SrcTy.isSimple() || !DstTy.isSimple())
3185 return AdjustCost(
3186 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3187
3188 // For the moment we do not have lowering for an SVE1-only fptrunc f64->bf16,
3189 // as we rely on fcvtx, which requires SVE2. Give these conversions invalid costs.
3190 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3191 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3192 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3193 return InstructionCost::getInvalid();
3194
3195 static const TypeConversionCostTblEntry BF16Tbl[] = {
3196 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3197 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3198 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3199 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3200 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3201 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3202 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3203 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3204 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3205 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3206 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3207 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3208 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3209 };
3210
3211 if (ST->hasBF16())
3212 if (const auto *Entry = ConvertCostTableLookup(
3213 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3214 return AdjustCost(Entry->Cost);
3215
3216 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3217 // The cost of unpacking twice is artificially increased for now in order
3218 // to avoid regressions against NEON, which will use tbl instructions directly
3219 // instead of multiple layers of [s|u]unpk[lo|hi].
3220 // We use the unpacks in cases where the destination type is illegal and
3221 // requires splitting of the input, even if the input type itself is legal.
3222 const unsigned int SVE_EXT_COST = 1;
3223 const unsigned int SVE_FCVT_COST = 1;
3224 const unsigned int SVE_UNPACK_ONCE = 4;
3225 const unsigned int SVE_UNPACK_TWICE = 16;
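// For example, reading the table below, a [su]itofp from nxv8i16 to nxv8f32
// (a destination that needs one unpack of the input) is costed as
// SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST = 4 + 2 = 6.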
3226
3227 static const TypeConversionCostTblEntry ConversionTbl[] = {
3228 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3229 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3230 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3231 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3232 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3233 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3234 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3235 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3236 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3237 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3238 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3239 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3240 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3241 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3242 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3243 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3244 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3245 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3246 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3247 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3248
3249 // Truncations on nxvmiN
3250 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3251 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3252 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3253 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3254 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3255 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3256 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3257 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3258 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3259 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3260 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3261 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3262 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3263 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3264 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3265 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3266 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3267 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3268 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3269 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3270 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3271 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3272 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3273 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3274 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3275 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3276 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3277 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3278 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3279 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3280 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3281 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3282 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3283
3284 // The number of shll instructions for the extension.
3285 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3286 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3287 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3288 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3289 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3290 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3291 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3292 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3293 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3294 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3295 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3296 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3297 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3298 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3299 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3300 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3301
3302 // FP Ext and trunc
3303 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3304 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3305 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3306 // FP16
3307 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3308 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3309 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3310 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3311 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3312 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3313 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3314 // BF16 (uses shift)
3315 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3316 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3317 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3318 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3319 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3320 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3321 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3322 // FP Ext and trunc
3323 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3324 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3325 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3326 // FP16
3327 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3328 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3329 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3330 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3331 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3332 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3333 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3334 // BF16 (more complex; the +bf16 case is handled above)
3335 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3336 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3337 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3338 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3339 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3340 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3341 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3342 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3343
3344 // LowerVectorINT_TO_FP:
3345 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3346 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3347 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3348 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3349 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3350 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3351
3352 // SVE: to nxv2f16
3353 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3354 SVE_EXT_COST + SVE_FCVT_COST},
3355 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3356 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3357 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3358 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3359 SVE_EXT_COST + SVE_FCVT_COST},
3360 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3361 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3362 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3363
3364 // SVE: to nxv4f16
3365 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3366 SVE_EXT_COST + SVE_FCVT_COST},
3367 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3368 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3369 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3370 SVE_EXT_COST + SVE_FCVT_COST},
3371 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3372 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3373
3374 // SVE: to nxv8f16
3375 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3376 SVE_EXT_COST + SVE_FCVT_COST},
3377 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3378 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3379 SVE_EXT_COST + SVE_FCVT_COST},
3380 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3381
3382 // SVE: to nxv16f16
3383 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3384 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3385 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3386 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3387
3388 // Complex: to v2f32
3389 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3390 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3391 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3392 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3393
3394 // SVE: to nxv2f32
3395 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3396 SVE_EXT_COST + SVE_FCVT_COST},
3397 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3398 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3399 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3400 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3401 SVE_EXT_COST + SVE_FCVT_COST},
3402 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3403 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3404 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3405
3406 // Complex: to v4f32
3407 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3408 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3409 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3410 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3411
3412 // SVE: to nxv4f32
3413 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3414 SVE_EXT_COST + SVE_FCVT_COST},
3415 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3416 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3417 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3418 SVE_EXT_COST + SVE_FCVT_COST},
3419 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3420 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3421
3422 // Complex: to v8f32
3423 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3424 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3425 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3426 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3427
3428 // SVE: to nxv8f32
3429 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3430 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3431 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3432 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3433 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3434 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3435 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3436 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3437
3438 // SVE: to nxv16f32
3439 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3440 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3441 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3442 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3443
3444 // Complex: to v16f32
3445 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3446 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3447
3448 // Complex: to v2f64
3449 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3450 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3451 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3452 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3453 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3454 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3455
3456 // SVE: to nxv2f64
3457 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3458 SVE_EXT_COST + SVE_FCVT_COST},
3459 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3460 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3461 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3462 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3463 SVE_EXT_COST + SVE_FCVT_COST},
3464 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3465 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3466 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3467
3468 // Complex: to v4f64
3469 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3470 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3471
3472 // SVE: to nxv4f64
3473 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3474 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3475 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3476 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3477 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3478 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3479 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3480 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3481 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3482 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3483 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3484 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3485
3486 // SVE: to nxv8f64
3487 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3488 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3489 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3490 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3491 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3492 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3493 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3494 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3495
3496 // LowerVectorFP_TO_INT
3497 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3498 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3499 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3500 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3501 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3502 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3503
3504 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3505 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3506 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3507 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3508 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3509 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3510 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3511
3512 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3513 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3514 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3515 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3516 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3517
3518 // Complex, from nxv2f32.
3519 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3520 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3521 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3522 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3523 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3524 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3525 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3526 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3527
3528 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3529 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3530 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3531 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3532 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3533 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3534 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3535
3536 // Complex, from nxv2f64.
3537 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3538 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3539 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3540 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3541 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3542 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3543 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3544 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3545 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3546 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3547
3548 // Complex, from nxv4f32.
3549 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3550 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3551 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3552 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3553 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3554 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3555 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3556 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3557 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3558 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3559
3560 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3561 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3562 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3563 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3564 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3565
3566 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3567 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3568 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3569 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3570 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3571 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3572 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3573
3574 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3575 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3576 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3577 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3578 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3579
3580 // Complex, from nxv8f16.
3581 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3582 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3583 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3584 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3585 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3586 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3587 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3588 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3589 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3590 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3591
3592 // Complex, from nxv4f16.
3593 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3594 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3595 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3596 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3597 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3598 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3599 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3600 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3601
3602 // Complex, from nxv2f16.
3603 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3604 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3605 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3606 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3607 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3608 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3609 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3610 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3611
3612 // Truncate from nxvmf32 to nxvmf16.
3613 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3614 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3615 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3616
3617 // Truncate from nxvmf32 to nxvmbf16.
3618 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3619 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3620 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3621
3622 // Truncate from nxvmf64 to nxvmf16.
3623 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3624 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3625 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3626
3627 // Truncate from nxvmf64 to nxvmbf16.
3628 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3629 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3630 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3631
3632 // Truncate from nxvmf64 to nxvmf32.
3633 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3634 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3635 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3636
3637 // Extend from nxvmf16 to nxvmf32.
3638 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3639 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3640 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3641
3642 // Extend from nxvmbf16 to nxvmf32.
3643 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3644 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3645 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3646
3647 // Extend from nxvmf16 to nxvmf64.
3648 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3649 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3650 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3651
3652 // Extend from nxvmbf16 to nxvmf64.
3653 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3654 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3655 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3656
3657 // Extend from nxvmf32 to nxvmf64.
3658 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3659 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3660 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3661
3662 // Bitcasts from float to integer
3663 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3664 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3665 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3666
3667 // Bitcasts from integer to float
3668 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3669 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3670 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3671
3672 // Add costs for extending to illegal (too wide) scalable vectors.
3673 // Zero/sign extends are implemented by multiple unpack operations,
3674 // where each operation has a cost of 1.
3675 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3676 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3677 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3678 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3679 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3680 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3681
3682 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3683 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3684 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3685 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3686 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3687 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3688 };
3689
3690 // We have to estimate the cost of a fixed-length operation performed on
3691 // SVE registers using the number of SVE registers required to represent
3692 // the fixed-length type.
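// For instance, with SVEBitsPerBlock == 128 a fixed-length v8i64 -> v8f64
// conversion is modelled as LT.first copies of the equivalent
// nxv2i64 -> nxv2f64 conversion (128 / 64 == 2 elements per register).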
3693 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3694 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3695 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3696 ST->useSVEForFixedLengthVectors(WiderTy)) {
3697 std::pair<InstructionCost, MVT> LT =
3698 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3699 unsigned NumElements =
3700 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3701 return AdjustCost(
3702 LT.first *
3703 getCastInstrCost(
3704 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3705 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3706 CostKind, I));
3707 }
3708
3709 if (const auto *Entry = ConvertCostTableLookup(
3710 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3711 return AdjustCost(Entry->Cost);
3712
3713 static const TypeConversionCostTblEntry FP16Tbl[] = {
3714 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3715 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3716 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3717 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3718 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3719 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3720 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3721 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3722 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3723 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3724 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3725 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3726 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3727 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3728 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3729 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3730 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3731 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3732 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3733 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3734 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3735 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3736 };
3737
3738 if (ST->hasFullFP16())
3739 if (const auto *Entry = ConvertCostTableLookup(
3740 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3741 return AdjustCost(Entry->Cost);
3742
3743 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3744 // double-rounding issues.
3745 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3746 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3748 return AdjustCost(
3750 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3751 CCH, CostKind) +
3753 CostKind) +
3755 CostKind));
3756
3757 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3759 ST->isSVEorStreamingSVEAvailable() &&
3760 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3761 TargetLowering::TypePromoteInteger &&
3762 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3763 TargetLowering::TypeSplitVector) {
3764 // The standard behaviour in the backend for these cases is to split the
3765 // extend up into two parts:
3766 // 1. Perform an extending load or masked load up to the legal type.
3767 // 2. Extend the loaded data to the final type.
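// For instance (illustrative, assuming nxv8i8 is promoted and nxv8i64 is
// split): a sign-extend with a masked-load context from nxv8i8 to nxv8i64
// would be costed as the nxv8i8 -> nxv8i16 extend (Part1) plus the
// nxv8i16 -> nxv8i64 extend (Part2).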
3768 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3769 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3770 InstructionCost Part1 = getCastInstrCost(
3771 Opcode, LegalTy, Src, CCH, CostKind, I);
3772 InstructionCost Part2 = getCastInstrCost(
3773 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3774 return Part1 + Part2;
3775 }
3776
3777 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3778 // but we also want to include the TTI::CastContextHint::Masked case too.
3779 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3780 CCH == TTI::CastContextHint::Masked &&
3781 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3783
3784 return AdjustCost(
3785 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3786}
3787
3788InstructionCost
3789AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3790 VectorType *VecTy, unsigned Index,
3791 TTI::TargetCostKind CostKind) const {
3792
3793 // Make sure we were given a valid extend opcode.
3794 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3795 "Invalid opcode");
3796
3797 // We are extending an element we extract from a vector, so the source type
3798 // of the extend is the element type of the vector.
3799 auto *Src = VecTy->getElementType();
3800
3801 // Sign- and zero-extends are for integer types only.
3802 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3803
3804 // Get the cost for the extract. We compute the cost (if any) for the extend
3805 // below.
3806 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3807 CostKind, Index, nullptr, nullptr);
3808
3809 // Legalize the types.
3810 auto VecLT = getTypeLegalizationCost(VecTy);
3811 auto DstVT = TLI->getValueType(DL, Dst);
3812 auto SrcVT = TLI->getValueType(DL, Src);
3813
3814 // If the resulting type is still a vector and the destination type is legal,
3815 // we may get the extension for free. If not, get the default cost for the
3816 // extend.
3817 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3818 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3819 CostKind);
3820
3821 // The destination type should be larger than the element type. If not, get
3822 // the default cost for the extend.
3823 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3824 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3825 CostKind);
3826
3827 switch (Opcode) {
3828 default:
3829 llvm_unreachable("Opcode should be either SExt or ZExt");
3830
3831 // For sign-extends, we only need a smov, which performs the extension
3832 // automatically.
3833 case Instruction::SExt:
3834 return Cost;
3835
3836 // For zero-extends, the extend is performed automatically by a umov unless
3837 // the destination type is i64 and the element type is i8 or i16.
3838 case Instruction::ZExt:
3839 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3840 return Cost;
3841 }
3842
3843 // If we are unable to perform the extend for free, get the default cost.
3844 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3845 CostKind);
3846}
3847
3848InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3849 TTI::TargetCostKind CostKind,
3850 const Instruction *I) const {
3851 if (CostKind != TTI::TCK_RecipThroughput)
3852 return Opcode == Instruction::PHI ? 0 : 1;
3853 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3854 // Branches are assumed to be predicted.
3855 return 0;
3856}
3857
3858InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3859 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3860 const Instruction *I, Value *Scalar,
3861 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3862 assert(Val->isVectorTy() && "This must be a vector type");
3863
3864 if (Index != -1U) {
3865 // Legalize the type.
3866 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3867
3868 // This type is legalized to a scalar type.
3869 if (!LT.second.isVector())
3870 return 0;
3871
3872 // The type may be split. For fixed-width vectors we can normalize the
3873 // index to the new type.
3874 if (LT.second.isFixedLengthVector()) {
3875 unsigned Width = LT.second.getVectorNumElements();
3876 Index = Index % Width;
3877 }
3878
3879 // The element at index zero is already inside the vector.
3880 // - For an insert-element or extract-element
3881 // instruction that operates on integers, an explicit FPR <-> GPR move is
3882 // needed, so it has a non-zero cost.
3883 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3884 return 0;
3885
3886 // This is recognising an LD1 (single-element structure to one lane of one
3887 // register) instruction. I.e., if this is an `insertelement` instruction
3888 // and its second operand is a load, then we will generate an LD1, which
3889 // is an expensive instruction.
3890 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3891 return CostKind == TTI::TCK_CodeSize
3892 ? 0
3894
3895 // i1 inserts and extracts will include an extra cset or cmp of the vector
3896 // value. Increase the cost by 1 to account for this.
3897 if (Val->getScalarSizeInBits() == 1)
3898 return CostKind == TTI::TCK_CodeSize
3899 ? 2
3900 : ST->getVectorInsertExtractBaseCost() + 1;
3901
3902 // FIXME:
3903 // If the extract-element and insert-element instructions could be
3904 // simplified away (e.g., could be combined into users by looking at use-def
3905 // context), they have no cost. This is not done in the first place for
3906 // compile-time considerations.
3907 }
3908
3909 // In case of Neon, if there exists extractelement from lane != 0 such that
3910 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3911 // 2. extractelement result feeds into fmul.
3912 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3913 // equivalent to 0.
3914 // then the extractelement can be merged with fmul in the backend and it
3915 // incurs no cost.
3916 // e.g.
3917 // define double @foo(<2 x double> %a) {
3918 // %1 = extractelement <2 x double> %a, i32 0
3919 // %2 = extractelement <2 x double> %a, i32 1
3920 // %res = fmul double %1, %2
3921 // ret double %res
3922 // }
3923 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3924 auto ExtractCanFuseWithFmul = [&]() {
3925 // We bail out if the extract is from lane 0.
3926 if (Index == 0)
3927 return false;
3928
3929 // Check if the scalar element type of the vector operand of ExtractElement
3930 // instruction is one of the allowed types.
3931 auto IsAllowedScalarTy = [&](const Type *T) {
3932 return T->isFloatTy() || T->isDoubleTy() ||
3933 (T->isHalfTy() && ST->hasFullFP16());
3934 };
3935
3936 // Check if the extractelement user is scalar fmul.
3937 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3938 // Check if the user is scalar fmul.
3939 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3940 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3941 !BO->getType()->isVectorTy();
3942 };
3943
3944 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3945 // certain scalar type and a certain vector register width.
3946 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3947 auto RegWidth =
3949 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector).getFixedValue();
3950 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3951 };
3952
3953 // Check if the type constraints on input vector type and result scalar type
3954 // of extractelement instruction are satisfied.
3955 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3956 return false;
3957
3958 if (Scalar) {
3959 DenseMap<User *, unsigned> UserToExtractIdx;
3960 for (auto *U : Scalar->users()) {
3961 if (!IsUserFMulScalarTy(U))
3962 return false;
3963 // Recording entry for the user is important. Index value is not
3964 // important.
3965 UserToExtractIdx[U];
3966 }
3967 if (UserToExtractIdx.empty())
3968 return false;
3969 for (auto &[S, U, L] : ScalarUserAndIdx) {
3970 for (auto *U : S->users()) {
3971 if (UserToExtractIdx.contains(U)) {
3972 auto *FMul = cast<BinaryOperator>(U);
3973 auto *Op0 = FMul->getOperand(0);
3974 auto *Op1 = FMul->getOperand(1);
3975 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3976 UserToExtractIdx[U] = L;
3977 break;
3978 }
3979 }
3980 }
3981 }
3982 for (auto &[U, L] : UserToExtractIdx) {
3983 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3984 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3985 return false;
3986 }
3987 } else {
3988 const auto *EE = cast<ExtractElementInst>(I);
3989
3990 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3991 if (!IdxOp)
3992 return false;
3993
3994 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3995 if (!IsUserFMulScalarTy(U))
3996 return false;
3997
3998 // Check if the other operand of extractelement is also extractelement
3999 // from lane equivalent to 0.
4000 const auto *BO = cast<BinaryOperator>(U);
4001 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4002 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4003 if (OtherEE) {
4004 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4005 if (!IdxOp)
4006 return false;
4007 return IsExtractLaneEquivalentToZero(
4008 cast<ConstantInt>(OtherEE->getIndexOperand())
4009 ->getValue()
4010 .getZExtValue(),
4011 OtherEE->getType()->getScalarSizeInBits());
4012 }
4013 return true;
4014 });
4015 }
4016 return true;
4017 };
4018
4019 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4020 ExtractCanFuseWithFmul())
4021 return 0;
4022
4023 // All other insert/extracts cost this much.
4024 return CostKind == TTI::TCK_CodeSize ? 1
4025 : ST->getVectorInsertExtractBaseCost();
4026}
4027
4028InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
4029 TTI::TargetCostKind CostKind,
4030 unsigned Index,
4031 const Value *Op0,
4032 const Value *Op1) const {
4033 // Treat an insert at lane 0 into a poison vector as having zero cost. This
4034 // ensures vector broadcasts via an insert + shuffle (which will be lowered to
4035 // a single dup) are treated as cheap.
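// e.g. the canonical broadcast idiom
//   %ins = insertelement <4 x i32> poison, i32 %s, i64 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
// becomes a single dup, so the insert itself should not be charged.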
4036 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4037 isa<PoisonValue>(Op0))
4038 return 0;
4039 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4040}
4041
4042InstructionCost AArch64TTIImpl::getVectorInstrCost(
4043 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4044 Value *Scalar,
4045 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4046 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4047 ScalarUserAndIdx);
4048}
4049
4050InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4051 Type *Val,
4052 TTI::TargetCostKind CostKind,
4053 unsigned Index) const {
4054 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4055}
4056
4060 unsigned Index) const {
4061 if (isa<FixedVectorType>(Val))
4063 Index);
4064
4065 // This typically requires both while and lastb instructions in order
4066 // to extract the last element. If this is in a loop the while
4067 // instruction can at least be hoisted out, although it will consume a
4068 // predicate register. The cost should be more expensive than the base
4069 // extract cost, which is 2 for most CPUs.
4070 return CostKind == TTI::TCK_CodeSize
4071 ? 2
4072 : ST->getVectorInsertExtractBaseCost() + 1;
4073}
4074
4075InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4076 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4077 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4078 ArrayRef<Value *> VL) const {
4079 if (isa<ScalableVectorType>(Ty))
4080 return InstructionCost::getInvalid();
4081 if (Ty->getElementType()->isFloatingPointTy())
4082 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4083 CostKind);
4084 unsigned VecInstCost =
4085 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4086 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4087}
4088
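// Helper used below for fp16/bf16 operations without native support: such
// operations are modelled (roughly) as the fpext(s) of the operands to f32,
// the f32 operation, and optionally an fptrunc back. E.g. without +fullfp16
// an fadd on <4 x half> is costed as fpext + f32 fadd + fptrunc.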
4089std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4090 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4091 TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
4092 std::function<InstructionCost(Type *)> InstCost) const {
4093 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4094 return std::nullopt;
4095 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4096 return std::nullopt;
4097
4098 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4099 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4100 TTI::CastContextHint::None, CostKind);
4101 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4102 Cost *= 2;
4103 Cost += InstCost(PromotedTy);
4104 if (IncludeTrunc)
4105 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4106 TTI::CastContextHint::None, CostKind);
4107 return Cost;
4108}
4109
4110InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4111 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4112 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4113 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4114
4115 // The code-generator is currently not able to handle scalable vectors
4116 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4117 // it. This change will be removed when code-generation for these types is
4118 // sufficiently reliable.
4119 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4120 if (VTy->getElementCount() == ElementCount::getScalable(1))
4121 return InstructionCost::getInvalid();
4122
4123 // TODO: Handle more cost kinds.
4124 if (CostKind != TTI::TCK_RecipThroughput)
4125 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4126 Op2Info, Args, CxtI);
4127
4128 // Legalize the type.
4129 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4130 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4131
4132 // Increase the cost for half and bfloat types if not architecturally
4133 // supported.
4134 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4135 ISD == ISD::FDIV || ISD == ISD::FREM)
4136 if (auto PromotedCost = getFP16BF16PromoteCost(
4137 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4138 [&](Type *PromotedTy) {
4139 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4140 Op1Info, Op2Info);
4141 }))
4142 return *PromotedCost;
4143
4144 switch (ISD) {
4145 default:
4146 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4147 Op2Info);
4148 case ISD::SREM:
4149 case ISD::SDIV:
4150 /*
4151 Notes for sdiv/srem specific costs:
4152 1. This only considers the cases where the divisor is constant, uniform and
4153 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4154 result in some form of (ldr + adrp), corresponding to constant vectors, or
4155 scalarization of the division operation.
4156 2. Constant divisors, whether wholly or partially negative, don't result in
4157 significantly different codegen compared to positive constant divisors.
4158 So, we don't consider negative divisors separately.
4159 3. If the codegen is significantly different with SVE, it has been indicated
4160 using comments at appropriate places.
4161
4162 sdiv specific cases:
4163 -----------------------------------------------------------------------
4164 codegen | pow-of-2 | Type
4165 -----------------------------------------------------------------------
4166 add + cmp + csel + asr | Y | i64
4167 add + cmp + csel + asr | Y | i32
4168 -----------------------------------------------------------------------
4169
4170 srem specific cases:
4171 -----------------------------------------------------------------------
4172 codegen | pow-of-2 | Type
4173 -----------------------------------------------------------------------
4174 negs + and + and + csneg | Y | i64
4175 negs + and + and + csneg | Y | i32
4176 -----------------------------------------------------------------------
4177
4178 other sdiv/srem cases:
4179 -------------------------------------------------------------------------
4180 common codegen | + srem | + sdiv | pow-of-2 | Type
4181 -------------------------------------------------------------------------
4182 smulh + asr + add + add | - | - | N | i64
4183 smull + lsr + add + add | - | - | N | i32
4184 usra | and + sub | sshr | Y | <2 x i64>
4185 2 * (scalar code) | - | - | N | <2 x i64>
4186 usra | bic + sub | sshr + neg | Y | <4 x i32>
4187 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4188 + sshr + usra | | | |
4189 -------------------------------------------------------------------------
4190 */
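// Worked example of the notes above: a scalar 'sdiv i32 %x, 8' maps to
// add+cmp+csel+asr and is costed below as 3 * AddCost + AsrCost, while
// 'srem i32 %x, 8' (negs+and+and+csneg) is costed as 3 * AsrCost + AddCost.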
4191 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4192 InstructionCost AddCost =
4193 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4194 Op1Info.getNoProps(), Op2Info.getNoProps());
4195 InstructionCost AsrCost =
4196 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4197 Op1Info.getNoProps(), Op2Info.getNoProps());
4198 InstructionCost MulCost =
4199 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4200 Op1Info.getNoProps(), Op2Info.getNoProps());
4201 // add/cmp/csel/csneg should have similar costs to one another, as should
4202 // asr/negs/and.
4203 auto VT = TLI->getValueType(DL, Ty);
4204 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4205 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4206 // Neg can be folded into the asr instruction.
4207 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4208 : (3 * AsrCost + AddCost);
4209 } else {
4210 return MulCost + AsrCost + 2 * AddCost;
4211 }
4212 } else if (VT.isVector()) {
4213 InstructionCost UsraCost = 2 * AsrCost;
4214 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4215 // Division with scalable types corresponds to native 'asrd'
4216 // instruction when SVE is available.
4217 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4218
4219 // One more for the negation in SDIV
4220 InstructionCost Cost =
4221 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4222 if (Ty->isScalableTy() && ST->hasSVE())
4223 Cost += 2 * AsrCost;
4224 else {
4225 Cost +=
4226 UsraCost +
4227 (ISD == ISD::SDIV
4228 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4229 : 2 * AddCost);
4230 }
4231 return Cost;
4232 } else if (LT.second == MVT::v2i64) {
4233 return VT.getVectorNumElements() *
4234 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4235 Op1Info.getNoProps(),
4236 Op2Info.getNoProps());
4237 } else {
4238 // When SVE is available, we get:
4239 // smulh + lsr + add/sub + asr + add/sub.
4240 if (Ty->isScalableTy() && ST->hasSVE())
4241 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4242 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4243 }
4244 }
4245 }
4246 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4247 LT.second.isFixedLengthVector()) {
4248 // FIXME: When the constant vector is non-uniform, this may result in
4249 // loading the vector from constant pool or in some cases, may also result
4250 // in scalarization. For now, we are approximating this with the
4251 // scalarization cost.
4252 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4253 CostKind, -1, nullptr, nullptr);
4254 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4255 CostKind, -1, nullptr, nullptr);
4256 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4257 return ExtractCost + InsertCost +
4258 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4259 CostKind, Op1Info.getNoProps(),
4260 Op2Info.getNoProps());
4261 }
4262 [[fallthrough]];
4263 case ISD::UDIV:
4264 case ISD::UREM: {
4265 auto VT = TLI->getValueType(DL, Ty);
4266 if (Op2Info.isConstant()) {
4267 // If the operand is a power of 2 we can use the shift or and cost.
4268 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4269 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4270 Op1Info.getNoProps(),
4271 Op2Info.getNoProps());
4272 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4273 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4274 Op1Info.getNoProps(),
4275 Op2Info.getNoProps());
4276
4277 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4278 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4279 // The MULHU will be expanded to UMULL for the types not listed below,
4280 // and will become a pair of UMULL+MULL2 for 128bit vectors.
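// e.g. (illustrative) for a fixed-length v4i32 udiv by such a constant,
// the formula below evaluates to 2 * MulCost + 2 * AddCost + 2 * ShrCost.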
4281 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4282 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4283 LT.second == MVT::nxv16i8;
4284 bool Is128bit = LT.second.is128BitVector();
4285
4286 InstructionCost MulCost =
4287 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4288 Op1Info.getNoProps(), Op2Info.getNoProps());
4289 InstructionCost AddCost =
4290 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4291 Op1Info.getNoProps(), Op2Info.getNoProps());
4292 InstructionCost ShrCost =
4293 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4294 Op1Info.getNoProps(), Op2Info.getNoProps());
4295 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4296 (HasMULH ? 0 : ShrCost) + // UMULL shift
4297 AddCost * 2 + ShrCost;
4298 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4299 }
4300 }
4301
4302 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4303 // emitted by the backend even when those functions are not declared in the
4304 // module.
4305 if (!VT.isVector() && VT.getSizeInBits() > 64)
4306 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4307
4309 Opcode, Ty, CostKind, Op1Info, Op2Info);
4310 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4311 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4312 // SDIV/UDIV operations are lowered using SVE, then we can have less
4313 // costs.
4314 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4315 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4316 static const CostTblEntry DivTbl[]{
4317 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4318 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4319 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4320 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4321 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4322 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4323
4324 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4325 if (nullptr != Entry)
4326 return Entry->Cost;
4327 }
4328 // For 8/16-bit elements, the cost is higher because the type
4329 // requires promotion and possibly splitting:
4330 if (LT.second.getScalarType() == MVT::i8)
4331 Cost *= 8;
4332 else if (LT.second.getScalarType() == MVT::i16)
4333 Cost *= 4;
4334 return Cost;
4335 } else {
4336 // If one of the operands is a uniform constant then the cost for each
4337 // element is the cost of insertion, extraction and division.
4338 // Insertion cost = 2, extraction cost = 2, division = cost of the
4339 // operation with the scalar type.
4340 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4341 (Op2Info.isConstant() && Op2Info.isUniform())) {
4342 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4343 InstructionCost DivCost = getArithmeticInstrCost(
4344 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4345 return (4 + DivCost) * VTy->getNumElements();
4346 }
4347 }
4348 // On AArch64, without SVE, vector divisions are expanded
4349 // into scalar divisions of each pair of elements.
4350 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4351 -1, nullptr, nullptr);
4352 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4353 nullptr, nullptr);
4354 }
4355
4356 // TODO: if one of the arguments is scalar, then it's not necessary to
4357 // double the cost of handling the vector elements.
4358 Cost += Cost;
4359 }
4360 return Cost;
4361 }
4362 case ISD::MUL:
4363 // When SVE is available, then we can lower the v2i64 operation using
4364 // the SVE mul instruction, which has a lower cost.
4365 if (LT.second == MVT::v2i64 && ST->hasSVE())
4366 return LT.first;
4367
4368 // When SVE is not available, there is no MUL.2d instruction,
4369 // which means mul <2 x i64> is expensive as elements are extracted
4370 // from the vectors and the muls scalarized.
4371 // As getScalarizationOverhead is a bit too pessimistic, we
4372 // estimate the cost for a i64 vector directly here, which is:
4373 // - four 2-cost i64 extracts,
4374 // - two 2-cost i64 inserts, and
4375 // - two 1-cost muls.
4376 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4377 // LT.first = 2 the cost is 28. If both operands are extensions it will not
4378 // need to scalarize so the cost can be cheaper (smull or umull).
4380 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
4381 return LT.first;
4382 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4383 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4384 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4385 nullptr, nullptr) *
4386 2 +
4387 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4388 nullptr, nullptr));
4389 case ISD::ADD:
4390 case ISD::XOR:
4391 case ISD::OR:
4392 case ISD::AND:
4393 case ISD::SRL:
4394 case ISD::SRA:
4395 case ISD::SHL:
4396 // These nodes are marked as 'custom' for combining purposes only.
4397 // We know that they are legal. See LowerAdd in ISelLowering.
4398 return LT.first;
4399
4400 case ISD::FNEG:
4401 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4402 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4403 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4404 CxtI &&
4405 ((CxtI->hasOneUse() &&
4406 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4407 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4408 return 0;
4409 [[fallthrough]];
4410 case ISD::FADD:
4411 case ISD::FSUB:
4412 if (!Ty->getScalarType()->isFP128Ty())
4413 return LT.first;
4414 [[fallthrough]];
4415 case ISD::FMUL:
4416 case ISD::FDIV:
4417 // These nodes are marked as 'custom' just to lower them to SVE.
4418 // We know said lowering will incur no additional cost.
4419 if (!Ty->getScalarType()->isFP128Ty())
4420 return 2 * LT.first;
4421
4422 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4423 Op2Info);
4424 case ISD::FREM:
4425 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4426 // those functions are not declared in the module.
4427 if (!Ty->isVectorTy())
4428 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4429 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4430 Op2Info);
4431 }
4432}
4433
4434InstructionCost
4435AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4436 const SCEV *Ptr,
4437 TTI::TargetCostKind CostKind) const {
4438 // Address computations in vectorized code with non-consecutive addresses will
4439 // likely result in more instructions compared to scalar code where the
4440 // computation can more often be merged into the index mode. The resulting
4441 // extra micro-ops can significantly decrease throughput.
4442 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4443 int MaxMergeDistance = 64;
4444
4445 if (PtrTy->isVectorTy() && SE &&
4446 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4447 return NumVectorInstToHideOverhead;
4448
4449 // In many cases the address computation is not merged into the instruction
4450 // addressing mode.
4451 return 1;
4452}
4453
4454/// Check whether Opcode1 has less throughput according to the scheduling
4455/// model than Opcode2.
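/// For example, this is used below to detect targets where the predicated SVE
/// FCMEQ has lower throughput than the equivalent Neon FCMEQ, and to scale the
/// compare cost accordingly.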
4456bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4457 unsigned Opcode1, unsigned Opcode2) const {
4458 const MCSchedModel &Sched = ST->getSchedModel();
4459 const TargetInstrInfo *TII = ST->getInstrInfo();
4460 if (!Sched.hasInstrSchedModel())
4461 return false;
4462
4463 const MCSchedClassDesc *SCD1 =
4464 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4465 const MCSchedClassDesc *SCD2 =
4466 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4467 // We cannot handle variant scheduling classes without an MI. If we need to
4468 // support them for any of the instructions whose information we query, we
4469 // might need to add a way to resolve them without an MI, or stop using the
4470 // scheduling info.
4471 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4472 "Cannot handle variant scheduling classes without an MI");
4473 if (!SCD1->isValid() || !SCD2->isValid())
4474 return false;
4475
4476 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4477 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4478}
4479
4480InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4481 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4482 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4483 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4484 // Some vector selects wider than the register width are not lowered well.
4485 // TODO: Improve this with different cost kinds.
4486 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4487 // We would need this many instructions to hide the scalarization happening.
4488 const int AmortizationCost = 20;
4489
4490 // If VecPred is not set, check if we can get a predicate from the context
4491 // instruction, if its type matches the requested ValTy.
4492 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4493 CmpPredicate CurrentPred;
4494 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4495 m_Value())))
4496 VecPred = CurrentPred;
4497 }
4498 // Check if we have a compare/select chain that can be lowered using
4499 // a (F)CMxx & BFI pair.
4500 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4501 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4502 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4503 VecPred == CmpInst::FCMP_UNE) {
4504 static const auto ValidMinMaxTys = {
4505 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4506 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4507 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4508
4509 auto LT = getTypeLegalizationCost(ValTy);
4510 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4511 (ST->hasFullFP16() &&
4512 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4513 return LT.first;
4514 }
4515
4516 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4517 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4518 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4519 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4520 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4521 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4522 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4523 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4524 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4525 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4526 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4527 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4528
4529 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4530 EVT SelValTy = TLI->getValueType(DL, ValTy);
4531 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4532 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4533 SelCondTy.getSimpleVT(),
4534 SelValTy.getSimpleVT()))
4535 return Entry->Cost;
4536 }
4537 }
4538
4539 if (Opcode == Instruction::FCmp) {
4540 if (auto PromotedCost = getFP16BF16PromoteCost(
4541 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4542 [&](Type *PromotedTy) {
4543 InstructionCost Cost =
4544 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4545 CostKind, Op1Info, Op2Info);
4546 if (isa<VectorType>(PromotedTy))
4547 Cost += getCastInstrCost(
4548 Instruction::Trunc,
4549 VectorType::getInteger(cast<VectorType>(ValTy)),
4550 VectorType::getInteger(cast<VectorType>(PromotedTy)),
4551 TTI::CastContextHint::None, CostKind);
4552 return Cost;
4553 }))
4554 return *PromotedCost;
4555
4556 auto LT = getTypeLegalizationCost(ValTy);
4557 // Model unknown fp compares as a libcall.
4558 if (LT.second.getScalarType() != MVT::f64 &&
4559 LT.second.getScalarType() != MVT::f32 &&
4560 LT.second.getScalarType() != MVT::f16)
4561 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4562 {ValTy, ValTy}, CostKind);
4563
4564 // Some comparison operators require expanding to multiple compares + or.
4565 unsigned Factor = 1;
4566 if (!CondTy->isVectorTy() &&
4567 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4568 Factor = 2; // fcmp with 2 selects
4569 else if (isa<FixedVectorType>(ValTy) &&
4570 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4571 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4572 Factor = 3; // fcmxx+fcmyy+or
4573 else if (isa<ScalableVectorType>(ValTy) &&
4574 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4575 Factor = 3; // fcmxx+fcmyy+or
4576
4577 if (isa<ScalableVectorType>(ValTy) &&
4579 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4580 AArch64::FCMEQv4f32))
4581 Factor *= 2;
4582
4583 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4584 }
4585
4586 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be folded
4587 // to icmp(and, 0), as free, since we can make use of ands, but only if the
4588 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4589 // providing it will not cause performance regressions.
4590 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4591 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4592 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4593 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4594 if (match(I->getOperand(1), m_Zero()))
4595 return 0;
4596
4597 // x >= 1 / x < 1 -> x > 0 / x <= 0
4598 if (match(I->getOperand(1), m_One()) &&
4599 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4600 return 0;
4601
4602 // x <= -1 / x > -1 -> x > 0 / x <= 0
4603 if (match(I->getOperand(1), m_AllOnes()) &&
4604 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4605 return 0;
4606 }
4607
4608 // The base case handles scalable vectors fine for now, since it treats the
4609 // cost as 1 * legalization cost.
4610 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4611 Op1Info, Op2Info, I);
4612}
4613
4614TTI::MemCmpExpansionOptions
4615AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4616 TTI::MemCmpExpansionOptions Options;
4617 if (ST->requiresStrictAlign()) {
4618 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4619 // a bunch of instructions when strict align is enabled.
4620 return Options;
4621 }
4622 Options.AllowOverlappingLoads = true;
4623 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4624 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4625 // TODO: Though vector loads usually perform well on AArch64, in some targets
4626 // they may wake up the FP unit, which raises the power consumption. Perhaps
4627 // they could be used with no holds barred (-O3).
4628 Options.LoadSizes = {8, 4, 2, 1};
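 // Allow expanding odd-sized tails of 3, 5 and 6 bytes, which can be handled
 // with two loads (2+1, 4+1 and 4+2 bytes respectively).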
4629 Options.AllowedTailExpansions = {3, 5, 6};
4630 return Options;
4631}
4632
4633bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4634 return ST->hasSVE();
4635}
4636
4637InstructionCost
4638AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4639 Align Alignment, unsigned AddressSpace,
4640 TTI::TargetCostKind CostKind) const {
4641 if (useNeonVector(Src))
4642 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
4643 CostKind);
4644 auto LT = getTypeLegalizationCost(Src);
4645 if (!LT.first.isValid())
4646 return InstructionCost::getInvalid();
4647
4648 // Return an invalid cost for element types that we are unable to lower.
4649 auto *VT = cast<VectorType>(Src);
4650 if (VT->getElementType()->isIntegerTy(1))
4651 return InstructionCost::getInvalid();
4652
4653 // The code-generator is currently not able to handle scalable vectors
4654 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4655 // it. This change will be removed when code-generation for these types is
4656 // sufficiently reliable.
4657 if (VT->getElementCount() == ElementCount::getScalable(1))
4658 return InstructionCost::getInvalid();
4659
4660 return LT.first;
4661}
4662
4663// This function returns the gather/scatter overhead, either from the
4664// user-provided value or from the per-target value in \p ST.
4665static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4666 const AArch64Subtarget *ST) {
4667 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4668 "Should be called on only load or stores.");
4669 switch (Opcode) {
4670 case Instruction::Load:
4671 if (SVEGatherOverhead.getNumOccurrences() > 0)
4672 return SVEGatherOverhead;
4673 return ST->getGatherOverhead();
4674 break;
4675 case Instruction::Store:
4676 if (SVEScatterOverhead.getNumOccurrences() > 0)
4677 return SVEScatterOverhead;
4678 return ST->getScatterOverhead();
4679 break;
4680 default:
4681 llvm_unreachable("Shouldn't have reached here");
4682 }
4683}
4684
4685InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4686 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4687 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4688 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4689 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4690 Alignment, CostKind, I);
4691 auto *VT = cast<VectorType>(DataTy);
4692 auto LT = getTypeLegalizationCost(DataTy);
4693 if (!LT.first.isValid())
4694 return InstructionCost::getInvalid();
4695
4696 // Return an invalid cost for element types that we are unable to lower.
4697 if (!LT.second.isVector() ||
4698 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4699 VT->getElementType()->isIntegerTy(1))
4700 return InstructionCost::getInvalid();
4701
4702 // The code-generator is currently not able to handle scalable vectors
4703 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4704 // it. This change will be removed when code-generation for these types is
4705 // sufficiently reliable.
4706 if (VT->getElementCount() == ElementCount::getScalable(1))
4707 return InstructionCost::getInvalid();
4708
4709 ElementCount LegalVF = LT.second.getVectorElementCount();
4710 InstructionCost MemOpCost =
4711 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4712 {TTI::OK_AnyValue, TTI::OP_None}, I);
4713 // Add on an overhead cost for using gathers/scatters.
4714 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
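 // In effect the gather/scatter is costed as getMaxNumElements(LegalVF)
 // individual element accesses per legalized vector, each scaled by the
 // overhead factor above.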
4715 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4716}
4717
4718bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4719 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4720}
4721
4722InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4723 Align Alignment,
4724 unsigned AddressSpace,
4725 TTI::TargetCostKind CostKind,
4726 TTI::OperandValueInfo OpInfo,
4727 const Instruction *I) const {
4728 EVT VT = TLI->getValueType(DL, Ty, true);
4729 // Type legalization can't handle structs
4730 if (VT == MVT::Other)
4731 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4732 CostKind);
4733
4734 auto LT = getTypeLegalizationCost(Ty);
4735 if (!LT.first.isValid())
4736 return InstructionCost::getInvalid();
4737
4738 // The code-generator is currently not able to handle scalable vectors
4739 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4740 // it. This change will be removed when code-generation for these types is
4741 // sufficiently reliable.
4742 // We also only support full register predicate loads and stores.
4743 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4744 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4745 (VTy->getElementType()->isIntegerTy(1) &&
4746 !VTy->getElementCount().isKnownMultipleOf(
4747 ElementCount::getScalable(16))))
4748 return InstructionCost::getInvalid();
4749
4750 // TODO: consider latency as well for TCK_SizeAndLatency.
4751 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4752 return LT.first;
4753
4754 if (CostKind != TTI::TCK_RecipThroughput)
4755 return 1;
4756
4757 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4758 LT.second.is128BitVector() && Alignment < Align(16)) {
4759 // Unaligned stores are extremely inefficient. We don't split all
4760 // unaligned 128-bit stores because of the negative impact that has shown
4761 // in practice on inlined block copy code.
4762 // We make such stores expensive so that we will only vectorize if there
4763 // are 6 other instructions getting vectorized.
4764 const int AmortizationCost = 6;
4765
4766 return LT.first * 2 * AmortizationCost;
4767 }
4768
4769 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4770 if (Ty->isPtrOrPtrVectorTy())
4771 return LT.first;
4772
4773 if (useNeonVector(Ty)) {
4774 // Check truncating stores and extending loads.
4775 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4776 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4777 if (VT == MVT::v4i8)
4778 return 2;
4779 // Otherwise we need to scalarize.
4780 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4781 }
4782 EVT EltVT = VT.getVectorElementType();
4783 unsigned EltSize = EltVT.getScalarSizeInBits();
4784 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4785 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4786 return LT.first;
4787 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4788 // widening to v4i8, which produces suboptimal results.
4789 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4790 return LT.first;
4791
4792 // Check non-power-of-2 loads/stores for legal vector element types with
4793 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4794 // operations on smaller power-of-2 ops, including ld1/st1.
4795 LLVMContext &C = Ty->getContext();
4796 InstructionCost Cost = 0;
4797 SmallVector<EVT> TypeWorklist;
4798 TypeWorklist.push_back(VT);
4799 while (!TypeWorklist.empty()) {
4800 EVT CurrVT = TypeWorklist.pop_back_val();
4801 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4802 if (isPowerOf2_32(CurrNumElements)) {
4803 Cost += 1;
4804 continue;
4805 }
4806
4807 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4808 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4809 TypeWorklist.push_back(
4810 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4811 }
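 // For example, a v7i8 access is broken into v4i8 + v3i8, and v3i8 again
 // into v2i8 + v1i8, giving a total cost of 3.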
4812 return Cost;
4813 }
4814
4815 return LT.first;
4816}
4817
4818InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
4819 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4820 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4821 bool UseMaskForCond, bool UseMaskForGaps) const {
4822 assert(Factor >= 2 && "Invalid interleave factor");
4823 auto *VecVTy = cast<VectorType>(VecTy);
4824
4825 if (VecTy->isScalableTy() && !ST->hasSVE())
4826 return InstructionCost::getInvalid();
4827
4828 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4829 // only have lowering for power-of-2 factors.
4830 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4831 // InterleavedAccessPass for ld3/st3
4832 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4833 return InstructionCost::getInvalid();
4834
4835 // Vectorization for masked interleaved accesses is only enabled for scalable
4836 // VF.
4837 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4838 return InstructionCost::getInvalid();
4839
4840 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4841 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4842 auto *SubVecTy =
4843 VectorType::get(VecVTy->getElementType(),
4844 VecVTy->getElementCount().divideCoefficientBy(Factor));
4845
4846 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4847 // Accesses having vector types that are a multiple of 128 bits can be
4848 // matched to more than one ldN/stN instruction.
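 // For example, an ld4 of <16 x i32> uses a <4 x i32> sub-vector type, which
 // maps to a single legal 128-bit access, giving a cost of Factor below.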
4849 bool UseScalable;
4850 if (MinElts % Factor == 0 &&
4851 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4852 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4853 }
4854
4855 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4856 Alignment, AddressSpace, CostKind,
4857 UseMaskForCond, UseMaskForGaps);
4858}
4859
4860InstructionCost
4861AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4862 InstructionCost Cost = 0;
4863 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4864 for (auto *I : Tys) {
4865 if (!I->isVectorTy())
4866 continue;
4867 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4868 128)
4869 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4870 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4871 }
4872 return Cost;
4873}
4874
4875unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) const {
4876 return ST->getMaxInterleaveFactor();
4877}
4878
4879// For Falkor, we want to avoid having too many strided loads in a loop since
4880// that can exhaust the HW prefetcher resources. We adjust the unroller
4881// MaxCount preference below to attempt to ensure unrolling doesn't create too
4882// many strided loads.
4883static void
4884getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4885 TargetTransformInfo::UnrollingPreferences &UP) {
4886 enum { MaxStridedLoads = 7 };
4887 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4888 int StridedLoads = 0;
4889 // FIXME? We could make this more precise by looking at the CFG and
4890 // e.g. not counting loads in each side of an if-then-else diamond.
4891 for (const auto BB : L->blocks()) {
4892 for (auto &I : *BB) {
4893 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4894 if (!LMemI)
4895 continue;
4896
4897 Value *PtrValue = LMemI->getPointerOperand();
4898 if (L->isLoopInvariant(PtrValue))
4899 continue;
4900
4901 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4902 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4903 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4904 continue;
4905
4906 // FIXME? We could take pairing of unrolled load copies into account
4907 // by looking at the AddRec, but we would probably have to limit this
4908 // to loops with no stores or other memory optimization barriers.
4909 ++StridedLoads;
4910 // We've seen enough strided loads that seeing more won't make a
4911 // difference.
4912 if (StridedLoads > MaxStridedLoads / 2)
4913 return StridedLoads;
4914 }
4915 }
4916 return StridedLoads;
4917 };
4918
4919 int StridedLoads = countStridedLoads(L, SE);
4920 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4921 << " strided loads\n");
4922 // Pick the largest power of 2 unroll count that won't result in too many
4923 // strided loads.
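 // For example, with 2 strided loads detected, MaxCount becomes
 // 1 << Log2_32(7 / 2) == 2.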
4924 if (StridedLoads) {
4925 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4926 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4927 << UP.MaxCount << '\n');
4928 }
4929}
4930
4931// This function returns true if the loop:
4932// 1. Has a valid cost, and
4933// 2. Has a cost within the supplied budget.
4934// Otherwise it returns false.
4935static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
4936 InstructionCost Budget,
4937 unsigned *FinalSize) {
4938 // Estimate the size of the loop.
4939 InstructionCost LoopCost = 0;
4940
4941 for (auto *BB : L->getBlocks()) {
4942 for (auto &I : *BB) {
4943 SmallVector<const Value *, 4> Operands(I.operand_values());
4944 InstructionCost Cost =
4945 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4946 // This can happen with intrinsics that don't currently have a cost model
4947 // or for some operations that require SVE.
4948 if (!Cost.isValid())
4949 return false;
4950
4951 LoopCost += Cost;
4952 if (LoopCost > Budget)
4953 return false;
4954 }
4955 }
4956
4957 if (FinalSize)
4958 *FinalSize = LoopCost.getValue();
4959 return true;
4960}
4961
4962static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4963 const AArch64TTIImpl &TTI) {
4964 // Only consider loops with unknown trip counts for which we can determine
4965 // a symbolic expression. Multi-exit loops with small known trip counts will
4966 // likely be unrolled anyway.
4967 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4968 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4969 return false;
4970
4971 // It might not be worth unrolling loops with low max trip counts. Restrict
4972 // this to max trip counts > 32 for now.
4973 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4974 if (MaxTC > 0 && MaxTC <= 32)
4975 return false;
4976
4977 // Make sure the loop size is <= 5.
4978 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4979 return false;
4980
4981 // Small search loops with multiple exits can be highly beneficial to unroll.
4982 // We only care about loops with exactly two exiting blocks, although each
4983 // block could jump to the same exit block.
4984 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4985 if (Blocks.size() != 2)
4986 return false;
4987
4988 if (any_of(Blocks, [](BasicBlock *BB) {
4989 return !isa<BranchInst>(BB->getTerminator());
4990 }))
4991 return false;
4992
4993 return true;
4994}
4995
4996/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4997/// OOO engine's wide instruction window and various predictors.
4998static void
4999getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5000 TargetTransformInfo::UnrollingPreferences &UP,
5001 const AArch64TTIImpl &TTI) {
5002 // Limit to loops with structure that is highly likely to benefit from runtime
5003 // unrolling; that is, we exclude outer loops and loops with many blocks (i.e.
5004 // likely with complex control flow). Note that the heuristics here may be
5005 // overly conservative: we err on the side of avoiding runtime unrolling
5006 // rather than unrolling excessively. They are all subject to further refinement.
5007 if (!L->isInnermost() || L->getNumBlocks() > 8)
5008 return;
5009
5010 // Loops with multiple exits are handled by common code.
5011 if (!L->getExitBlock())
5012 return;
5013
5014 // Check if the loop contains any reductions that could be parallelized when
5015 // unrolling. If so, enable partial unrolling, if the trip count is known to be
5016 // a multiple of 2.
5017 bool HasParellelizableReductions =
5018 L->getNumBlocks() == 1 &&
5019 any_of(L->getHeader()->phis(),
5020 [&SE, L](PHINode &Phi) {
5021 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5022 }) &&
5023 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5024 if (HasParellelizableReductions &&
5025 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5026 UP.Partial = true;
5027 UP.MaxCount = 4;
5028 UP.AddAdditionalAccumulators = true;
5029 }
5030
5031 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5032 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5033 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5034 SE.getSmallConstantMaxTripCount(L) <= 32))
5035 return;
5036
5037 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5038 return;
5039
5040 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
5041 return;
5042
5043 // Limit to loops with trip counts that are cheap to expand.
5044 UP.SCEVExpansionBudget = 1;
5045
5046 if (HasParellelizableReductions) {
5047 UP.Runtime = true;
5048 UP.DefaultUnrollRuntimeCount = 4;
5049 UP.AddAdditionalAccumulators = true;
5050 }
5051
5052 // Try to unroll small loops of a few blocks with a low size budget, if they
5053 // have load/store dependencies, to expose more parallel memory access streams,
5054 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5055 BasicBlock *Header = L->getHeader();
5056 BasicBlock *Latch = L->getLoopLatch();
5057 if (Header == Latch) {
5058 // Estimate the size of the loop.
5059 unsigned Size;
5060 unsigned Width = 10;
5061 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5062 return;
5063
5064 // Try to find an unroll count that maximizes the use of the instruction
5065 // window, i.e. trying to fetch as many instructions per cycle as possible.
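 // For example, a 10-instruction body is unrolled 3 times: no candidate count
 // gives a multiple of MaxInstsPerLine, and 30 instructions leave the largest
 // remainder modulo 16 among the counts whose unrolled size stays within 48.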
5066 unsigned MaxInstsPerLine = 16;
5067 unsigned UC = 1;
5068 unsigned BestUC = 1;
5069 unsigned SizeWithBestUC = BestUC * Size;
5070 while (UC <= 8) {
5071 unsigned SizeWithUC = UC * Size;
5072 if (SizeWithUC > 48)
5073 break;
5074 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5075 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5076 BestUC = UC;
5077 SizeWithBestUC = BestUC * Size;
5078 }
5079 UC++;
5080 }
5081
5082 if (BestUC == 1)
5083 return;
5084
5085 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5086 SmallVector<StoreInst *> Stores;
5087 for (auto *BB : L->blocks()) {
5088 for (auto &I : *BB) {
5089 Value *Ptr = getLoadStorePointerOperand(&I);
5090 if (!Ptr)
5091 continue;
5092 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5093 if (SE.isLoopInvariant(PtrSCEV, L))
5094 continue;
5095 if (isa<LoadInst>(&I)) {
5096 LoadedValuesPlus.insert(&I);
5097 // Include in-loop 1st users of loaded values.
5098 for (auto *U : I.users())
5099 if (L->contains(cast<Instruction>(U)))
5100 LoadedValuesPlus.insert(U);
5101 } else
5102 Stores.push_back(cast<StoreInst>(&I));
5103 }
5104 }
5105
5106 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5107 return LoadedValuesPlus.contains(SI->getOperand(0));
5108 }))
5109 return;
5110
5111 UP.Runtime = true;
5112 UP.DefaultUnrollRuntimeCount = BestUC;
5113 return;
5114 }
5115
5116 // Try to runtime-unroll loops with early-continues depending on loop-varying
5117 // loads; this helps with branch-prediction for the early-continues.
5118 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5119 SmallVector<BasicBlock *> Preds(predecessors(Latch));
5120 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5121 !llvm::is_contained(Preds, Header) ||
5122 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5123 return;
5124
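 // DependsOnLoopLoad returns true if I is (transitively) fed by a load inside
 // the loop, capping the recursion depth to keep compile time bounded.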
5125 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5126 [&](Instruction *I, unsigned Depth) -> bool {
5127 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5128 return false;
5129
5130 if (isa<LoadInst>(I))
5131 return true;
5132
5133 return any_of(I->operands(), [&](Value *V) {
5134 auto *I = dyn_cast<Instruction>(V);
5135 return I && DependsOnLoopLoad(I, Depth + 1);
5136 });
5137 };
5138 CmpPredicate Pred;
5139 Instruction *I;
5140 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5141 m_Value())) &&
5142 DependsOnLoopLoad(I, 0)) {
5143 UP.Runtime = true;
5144 }
5145}
5146
5147void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5148 TTI::UnrollingPreferences &UP,
5149 OptimizationRemarkEmitter *ORE) const {
5150 // Enable partial unrolling and runtime unrolling.
5151 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5152
5153 UP.UpperBound = true;
5154
5155 // Inner loops are more likely to be hot, and the runtime check can be
5156 // hoisted out by the LICM pass, so the overhead is lower; try a larger
5157 // threshold to unroll more loops.
5158 if (L->getLoopDepth() > 1)
5159 UP.PartialThreshold *= 2;
5160
5161 // Disable partial & runtime unrolling on -Os.
5162 UP.PartialOptSizeThreshold = 0;
5163
5164 // Scan the loop: don't unroll loops with calls as this could prevent
5165 // inlining. Don't unroll auto-vectorized loops either, though do allow
5166 // unrolling of the scalar remainder.
5167 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5168 for (auto *BB : L->getBlocks()) {
5169 for (auto &I : *BB) {
5170 // Both auto-vectorized loops and the scalar remainder have the
5171 // isvectorized attribute, so differentiate between them by the presence
5172 // of vector instructions.
5173 if (IsVectorized && I.getType()->isVectorTy())
5174 return;
5175 if (isa<CallBase>(I)) {
5176 if (const Function *F =
5177 cast<CallBase>(I).getCalledFunction())
5178 if (!isLoweredToCall(F))
5179 continue;
5180 return;
5181 }
5182 }
5183 }
5184
5185 // Apply subtarget-specific unrolling preferences.
5186 switch (ST->getProcFamily()) {
5187 case AArch64Subtarget::AppleA14:
5188 case AArch64Subtarget::AppleA15:
5189 case AArch64Subtarget::AppleA16:
5190 case AArch64Subtarget::AppleM4:
5191 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5192 break;
5193 case AArch64Subtarget::Falkor:
5194 if (EnableFalkorHWPFUnrollFix)
5195 getFalkorUnrollingPreferences(L, SE, UP);
5196 break;
5197 default:
5198 break;
5199 }
5200
5201 // If this is a small, multi-exit loop similar to something like std::find,
5202 // then there is typically a performance improvement achieved by unrolling.
5203 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5204 UP.RuntimeUnrollMultiExit = true;
5205 UP.Runtime = true;
5206 // Limit unroll count.
5207 UP.DefaultUnrollRuntimeCount = 4;
5208 // Allow slightly more costly trip-count expansion to catch search loops
5209 // with pointer inductions.
5210 UP.SCEVExpansionBudget = 5;
5211 return;
5212 }
5213
5214 // Enable runtime unrolling for in-order models.
5215 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5216 // by checking for that case, we can ensure that the default behaviour is
5217 // unchanged.
5218 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5219 !ST->getSchedModel().isOutOfOrder()) {
5220 UP.Runtime = true;
5221 UP.Partial = true;
5222 UP.UnrollRemainder = true;
5223 UP.DefaultUnrollRuntimeCount = 4;
5224
5225 UP.UnrollAndJam = true;
5226 UP.UnrollAndJamInnerLoopThreshold = 60;
5227 }
5228}
5229
5230void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5231 TTI::PeelingPreferences &PP) const {
5232 BaseT::getPeelingPreferences(L, SE, PP);
5233}
5234
5235Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5236 Type *ExpectedType,
5237 bool CanCreate) const {
5238 switch (Inst->getIntrinsicID()) {
5239 default:
5240 return nullptr;
5241 case Intrinsic::aarch64_neon_st2:
5242 case Intrinsic::aarch64_neon_st3:
5243 case Intrinsic::aarch64_neon_st4: {
5244 // Create a struct type
5245 StructType *ST = dyn_cast<StructType>(ExpectedType);
5246 if (!CanCreate || !ST)
5247 return nullptr;
5248 unsigned NumElts = Inst->arg_size() - 1;
5249 if (ST->getNumElements() != NumElts)
5250 return nullptr;
5251 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5252 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5253 return nullptr;
5254 }
5255 Value *Res = PoisonValue::get(ExpectedType);
5256 IRBuilder<> Builder(Inst);
5257 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5258 Value *L = Inst->getArgOperand(i);
5259 Res = Builder.CreateInsertValue(Res, L, i);
5260 }
5261 return Res;
5262 }
5263 case Intrinsic::aarch64_neon_ld2:
5264 case Intrinsic::aarch64_neon_ld3:
5265 case Intrinsic::aarch64_neon_ld4:
5266 if (Inst->getType() == ExpectedType)
5267 return Inst;
5268 return nullptr;
5269 }
5270}
5271
5272bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5273 MemIntrinsicInfo &Info) const {
5274 switch (Inst->getIntrinsicID()) {
5275 default:
5276 break;
5277 case Intrinsic::aarch64_neon_ld2:
5278 case Intrinsic::aarch64_neon_ld3:
5279 case Intrinsic::aarch64_neon_ld4:
5280 Info.ReadMem = true;
5281 Info.WriteMem = false;
5282 Info.PtrVal = Inst->getArgOperand(0);
5283 break;
5284 case Intrinsic::aarch64_neon_st2:
5285 case Intrinsic::aarch64_neon_st3:
5286 case Intrinsic::aarch64_neon_st4:
5287 Info.ReadMem = false;
5288 Info.WriteMem = true;
5289 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5290 break;
5291 }
5292
5293 switch (Inst->getIntrinsicID()) {
5294 default:
5295 return false;
5296 case Intrinsic::aarch64_neon_ld2:
5297 case Intrinsic::aarch64_neon_st2:
5298 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5299 break;
5300 case Intrinsic::aarch64_neon_ld3:
5301 case Intrinsic::aarch64_neon_st3:
5302 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5303 break;
5304 case Intrinsic::aarch64_neon_ld4:
5305 case Intrinsic::aarch64_neon_st4:
5306 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5307 break;
5308 }
5309 return true;
5310}
5311
5312/// See if \p I should be considered for address type promotion. We check if \p
5313/// I is a sext with the right type and used in memory accesses. If it is used in a
5314/// "complex" getelementptr, we allow it to be promoted without finding other
5315/// sext instructions that sign extended the same initial value. A getelementptr
5316/// is considered as "complex" if it has more than 2 operands.
5318 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5319 bool Considerable = false;
5320 AllowPromotionWithoutCommonHeader = false;
5321 if (!isa<SExtInst>(&I))
5322 return false;
5323 Type *ConsideredSExtType =
5324 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5325 if (I.getType() != ConsideredSExtType)
5326 return false;
5327 // See if the sext is the one with the right type and used in at least one
5328 // GetElementPtrInst.
5329 for (const User *U : I.users()) {
5330 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5331 Considerable = true;
5332 // A getelementptr is considered as "complex" if it has more than 2
5333 // operands. We will promote a SExt used in such complex GEP as we
5334 // expect some computation to be merged if they are done on 64 bits.
5335 if (GEPInst->getNumOperands() > 2) {
5336 AllowPromotionWithoutCommonHeader = true;
5337 break;
5338 }
5339 }
5340 }
5341 return Considerable;
5342}
5343
5344bool AArch64TTIImpl::isLegalToVectorizeReduction(
5345 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5346 if (!VF.isScalable())
5347 return true;
5348
5349 Type *Ty = RdxDesc.getRecurrenceType();
5350 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5351 return false;
5352
5353 switch (RdxDesc.getRecurrenceKind()) {
5354 case RecurKind::Sub:
5356 case RecurKind::Add:
5357 case RecurKind::FAdd:
5358 case RecurKind::And:
5359 case RecurKind::Or:
5360 case RecurKind::Xor:
5361 case RecurKind::SMin:
5362 case RecurKind::SMax:
5363 case RecurKind::UMin:
5364 case RecurKind::UMax:
5365 case RecurKind::FMin:
5366 case RecurKind::FMax:
5367 case RecurKind::FMulAdd:
5368 case RecurKind::AnyOf:
5369 return true;
5370 default:
5371 return false;
5372 }
5373}
5374
5375InstructionCost
5376AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5377 FastMathFlags FMF,
5378 TTI::TargetCostKind CostKind) const {
5379 // The code-generator is currently not able to handle scalable vectors
5380 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5381 // it. This change will be removed when code-generation for these types is
5382 // sufficiently reliable.
5383 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5384 if (VTy->getElementCount() == ElementCount::getScalable(1))
5385 return InstructionCost::getInvalid();
5386
5387 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5388
5389 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5390 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5391
5392 InstructionCost LegalizationCost = 0;
5393 if (LT.first > 1) {
5394 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5395 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5396 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5397 }
5398
5399 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5400}
5401
5402InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5403 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5404 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5405 InstructionCost LegalizationCost = 0;
5406 if (LT.first > 1) {
5407 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5408 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5409 LegalizationCost *= LT.first - 1;
5410 }
5411
5412 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5413 assert(ISD && "Invalid opcode");
5414 // Add the final reduction cost for the legal horizontal reduction
5415 switch (ISD) {
5416 case ISD::ADD:
5417 case ISD::AND:
5418 case ISD::OR:
5419 case ISD::XOR:
5420 case ISD::FADD:
5421 return LegalizationCost + 2;
5422 default:
5423 return InstructionCost::getInvalid();
5424 }
5425}
5426
5427InstructionCost
5428AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5429 std::optional<FastMathFlags> FMF,
5430 TTI::TargetCostKind CostKind) const {
5431 // The code-generator is currently not able to handle scalable vectors
5432 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5433 // it. This change will be removed when code-generation for these types is
5434 // sufficiently reliable.
5435 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5436 if (VTy->getElementCount() == ElementCount::getScalable(1))
5437 return InstructionCost::getInvalid();
5438
5439 if (TTI::requiresOrderedReduction(FMF)) {
5440 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5441 InstructionCost BaseCost =
5442 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5443 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5444 // end up vectorizing for more computationally intensive loops.
5445 return BaseCost + FixedVTy->getNumElements();
5446 }
5447
5448 if (Opcode != Instruction::FAdd)
5449 return InstructionCost::getInvalid();
5450
5451 auto *VTy = cast<ScalableVectorType>(ValTy);
5452 InstructionCost Cost =
5453 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5454 Cost *= getMaxNumElements(VTy->getElementCount());
5455 return Cost;
5456 }
5457
5458 if (isa<ScalableVectorType>(ValTy))
5459 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5460
5461 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5462 MVT MTy = LT.second;
5463 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5464 assert(ISD && "Invalid opcode");
5465
5466 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5467 // instructions as twice a normal vector add, plus 1 for each legalization
5468 // step (LT.first). This is the only arithmetic vector reduction operation for
5469 // which we have an instruction.
5470 // OR, XOR and AND costs should match the codegen from:
5471 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5472 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5473 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5474 static const CostTblEntry CostTblNoPairwise[]{
5475 {ISD::ADD, MVT::v8i8, 2},
5476 {ISD::ADD, MVT::v16i8, 2},
5477 {ISD::ADD, MVT::v4i16, 2},
5478 {ISD::ADD, MVT::v8i16, 2},
5479 {ISD::ADD, MVT::v2i32, 2},
5480 {ISD::ADD, MVT::v4i32, 2},
5481 {ISD::ADD, MVT::v2i64, 2},
5482 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5483 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5484 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5485 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5486 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5487 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5488 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5489 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5490 {ISD::XOR, MVT::v16i8, 7},
5491 {ISD::XOR, MVT::v4i16, 4},
5492 {ISD::XOR, MVT::v8i16, 6},
5493 {ISD::XOR, MVT::v2i32, 3},
5494 {ISD::XOR, MVT::v4i32, 5},
5495 {ISD::XOR, MVT::v2i64, 3},
5496 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5497 {ISD::AND, MVT::v16i8, 7},
5498 {ISD::AND, MVT::v4i16, 4},
5499 {ISD::AND, MVT::v8i16, 6},
5500 {ISD::AND, MVT::v2i32, 3},
5501 {ISD::AND, MVT::v4i32, 5},
5502 {ISD::AND, MVT::v2i64, 3},
5503 };
5504 switch (ISD) {
5505 default:
5506 break;
5507 case ISD::FADD:
5508 if (Type *EltTy = ValTy->getScalarType();
5509 // FIXME: For half types without fullfp16 support, this could extend and
5510 // use a fp32 faddp reduction but current codegen unrolls.
5511 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5512 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5513 const unsigned NElts = MTy.getVectorNumElements();
5514 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5515 isPowerOf2_32(NElts))
5516 // Reduction corresponding to series of fadd instructions is lowered to
5517 // series of faddp instructions. faddp has latency/throughput that
5518 // matches fadd instruction and hence, every faddp instruction can be
5519 // considered to have a relative cost = 1 with
5520 // CostKind = TCK_RecipThroughput.
5521 // An faddp will pairwise add vector elements, so the size of input
5522 // vector reduces by half every time, requiring
5523 // #(faddp instructions) = log2_32(NElts).
5524 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5525 }
5526 break;
5527 case ISD::ADD:
5528 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5529 return (LT.first - 1) + Entry->Cost;
5530 break;
5531 case ISD::XOR:
5532 case ISD::AND:
5533 case ISD::OR:
5534 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5535 if (!Entry)
5536 break;
5537 auto *ValVTy = cast<FixedVectorType>(ValTy);
5538 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5539 isPowerOf2_32(ValVTy->getNumElements())) {
5540 InstructionCost ExtraCost = 0;
5541 if (LT.first != 1) {
5542 // Type needs to be split, so there is an extra cost of LT.first - 1
5543 // arithmetic ops.
5544 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5545 MTy.getVectorNumElements());
5546 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5547 ExtraCost *= LT.first - 1;
5548 }
5549 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5550 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5551 return Cost + ExtraCost;
5552 }
5553 break;
5554 }
5555 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5556}
5557
5558InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5559 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5560 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5561 EVT VecVT = TLI->getValueType(DL, VecTy);
5562 EVT ResVT = TLI->getValueType(DL, ResTy);
5563
5564 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5565 VecVT.getSizeInBits() >= 64) {
5566 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5567
5568 // The legal cases are:
5569 // UADDLV 8/16/32->32
5570 // UADDLP 32->64
5571 unsigned RevVTSize = ResVT.getSizeInBits();
5572 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5573 RevVTSize <= 32) ||
5574 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5575 RevVTSize <= 32) ||
5576 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5577 RevVTSize <= 64))
5578 return (LT.first - 1) * 2 + 2;
5579 }
5580
5581 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5582 CostKind);
5583}
5584
5585InstructionCost
5586AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5587 Type *ResTy, VectorType *VecTy,
5589 EVT VecVT = TLI->getValueType(DL, VecTy);
5590 EVT ResVT = TLI->getValueType(DL, ResTy);
5591
5592 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5593 RedOpcode == Instruction::Add) {
5594 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5595
5596 // The legal cases with dotprod are
5597 // UDOT 8->32
5598 // Which requires an additional uaddv to sum the i32 values.
5599 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5600 ResVT == MVT::i32)
5601 return LT.first + 2;
5602 }
5603
5604 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5605 CostKind);
5606}
5607
5608InstructionCost
5609AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5610 TTI::TargetCostKind CostKind) const {
5611 static const CostTblEntry ShuffleTbl[] = {
5612 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5613 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5614 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5615 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5616 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5617 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5618 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5619 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5620 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5621 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5622 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5623 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5624 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5625 };
5626
5627 // The code-generator is currently not able to handle scalable vectors
5628 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5629 // it. This change will be removed when code-generation for these types is
5630 // sufficiently reliable.
5631 if (Tp->getElementCount() == ElementCount::getScalable(1))
5632 return InstructionCost::getInvalid();
5633
5634 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5635 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5636 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5637 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5638 : LT.second;
5639 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5640 InstructionCost LegalizationCost = 0;
5641 if (Index < 0) {
5642 LegalizationCost =
5643 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5644 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5645 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5646 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5647 }
5648
5649 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
5650 // The cost is computed on the promoted type.
5651 if (LT.second.getScalarType() == MVT::i1) {
5652 LegalizationCost +=
5653 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5654 TTI::CastContextHint::None, CostKind) +
5655 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5656 TTI::CastContextHint::None, CostKind);
5657 }
5658 const auto *Entry =
5659 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5660 assert(Entry && "Illegal Type for Splice");
5661 LegalizationCost += Entry->Cost;
5662 return LegalizationCost * LT.first;
5663}
5664
5665InstructionCost AArch64TTIImpl::getPartialReductionCost(
5666 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5667 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5668 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5669 TTI::TargetCostKind CostKind) const {
5670 InstructionCost Invalid = InstructionCost::getInvalid();
5671
5672 if (CostKind != TTI::TCK_RecipThroughput)
5673 return Invalid;
5674
5675 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5676 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5677 return Invalid;
5678
5679 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5680 OpAExtend == TTI::PR_None)
5681 return Invalid;
5682
5683 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5684 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5685 "Unexpected values for OpBExtend or InputTypeB");
5686
5687 // We only support multiply binary operations for now, and for muls we
5688 // require the types being extended to be the same.
5689 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5690 return Invalid;
5691
5692 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5693 if (IsUSDot && !ST->hasMatMulInt8())
5694 return Invalid;
5695
5696 unsigned Ratio =
5697 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
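 // For example, i8 inputs accumulated into i32 lanes give a ratio of 4; with
 // VF <= 4 the accumulator would have at most one lane, so there is no
 // partial reduction to be done.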
5698 if (VF.getKnownMinValue() <= Ratio)
5699 return Invalid;
5700
5701 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5702 VectorType *AccumVectorType =
5703 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5704 // We don't yet support all kinds of legalization.
5705 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5706 EVT::getEVT(AccumVectorType));
5707 switch (TC.first) {
5708 default:
5709 return Invalid;
5710 case TargetLoweringBase::TypeLegal:
5711 case TargetLoweringBase::TypePromoteInteger:
5712 case TargetLoweringBase::TypeSplitVector:
5713 // The legalised type (e.g. after splitting) must be legal too.
5714 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5715 TargetLoweringBase::TypeLegal)
5716 return Invalid;
5717 break;
5718 }
5719
5720 std::pair<InstructionCost, MVT> AccumLT =
5721 getTypeLegalizationCost(AccumVectorType);
5722 std::pair<InstructionCost, MVT> InputLT =
5723 getTypeLegalizationCost(InputVectorType);
5724
5725 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5726
5727 // Prefer using full types by costing half-full input types as more expensive.
5728 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5730 // FIXME: This can be removed after the cost of the extends are folded into
5731 // the dot-product expression in VPlan, after landing:
5732 // https://github.com/llvm/llvm-project/pull/147302
5733 Cost *= 2;
5734
5735 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5736 // i16 -> i64 is natively supported for udot/sdot
5737 if (AccumLT.second.getScalarType() == MVT::i64 &&
5738 InputLT.second.getScalarType() == MVT::i16)
5739 return Cost;
5740 // i8 -> i64 is supported with an extra level of extends
5741 if (AccumLT.second.getScalarType() == MVT::i64 &&
5742 InputLT.second.getScalarType() == MVT::i8)
5743 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5744 // because it requires two extra extends on the inputs. But if we'd change
5745 // that now, a regular reduction would be cheaper because the costs of
5746 // the extends in the IR are still counted. This can be fixed
5747 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5748 return Cost;
5749 }
5750
5751 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5752 if (ST->isSVEorStreamingSVEAvailable() ||
5753 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5754 ST->hasDotProd())) {
5755 if (AccumLT.second.getScalarType() == MVT::i32 &&
5756 InputLT.second.getScalarType() == MVT::i8)
5757 return Cost;
5758 }
5759
5760 // Add additional cost for the extends that would need to be inserted.
5761 return Cost + 2;
5762}
5763
5764InstructionCost
5765AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5766 VectorType *SrcTy, ArrayRef<int> Mask,
5767 TTI::TargetCostKind CostKind, int Index,
5768 VectorType *SubTp, ArrayRef<const Value *> Args,
5769 const Instruction *CxtI) const {
5770 assert((Mask.empty() || DstTy->isScalableTy() ||
5771 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5772 "Expected the Mask to match the return size if given");
5773 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5774 "Expected the same scalar types");
5775 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5776
5777 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5778 // into smaller vectors and sum the cost of each shuffle.
5779 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5780 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5781 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5782 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5783 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5784 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5785 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5786 // cost than just the load.
5787 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5788 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
5789 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
5790 return std::max<InstructionCost>(1, LT.first / 4);
5791
5792 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5793 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5794 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5795 // cost than just the store.
5796 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5797 (ShuffleVectorInst::isInterleaveMask(
5798 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5799 ShuffleVectorInst::isInterleaveMask(
5800 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5801 return LT.first;
5802
5803 unsigned TpNumElts = Mask.size();
5804 unsigned LTNumElts = LT.second.getVectorNumElements();
5805 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
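 // For example, a 16-element mask over a type that legalizes to v4i32 is
 // costed as NumVecs == 4 sub-shuffles of LTNumElts == 4 elements each.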
5806 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5807 LT.second.getVectorElementCount());
5808 InstructionCost Cost;
5809 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5810 PreviousCosts;
5811 for (unsigned N = 0; N < NumVecs; N++) {
5812 SmallVector<int> NMask;
5813 // Split the existing mask into chunks of size LTNumElts. Track the source
5814 // sub-vectors to ensure the result has at most 2 inputs.
5815 unsigned Source1 = -1U, Source2 = -1U;
5816 unsigned NumSources = 0;
5817 for (unsigned E = 0; E < LTNumElts; E++) {
5818 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5819 : PoisonMaskElem;
5820 if (MaskElt < 0) {
5821 NMask.push_back(PoisonMaskElem);
5822 continue;
5823 }
5824
5825 // Calculate which source from the input this comes from and whether it
5826 // is new to us.
5827 unsigned Source = MaskElt / LTNumElts;
5828 if (NumSources == 0) {
5829 Source1 = Source;
5830 NumSources = 1;
5831 } else if (NumSources == 1 && Source != Source1) {
5832 Source2 = Source;
5833 NumSources = 2;
5834 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5835 NumSources++;
5836 }
5837
5838 // Add to the new mask. For the NumSources>2 case these are not correct,
5839 // but are only used for the modular lane number.
5840 if (Source == Source1)
5841 NMask.push_back(MaskElt % LTNumElts);
5842 else if (Source == Source2)
5843 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5844 else
5845 NMask.push_back(MaskElt % LTNumElts);
5846 }
5847 // Check if we have already generated this sub-shuffle, which means we
5848 // will have already generated the output. For example a <16 x i32> splat
5849 // will be the same sub-splat 4 times, which only needs to be generated
5850 // once and reused.
5851 auto Result =
5852 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5853 // Check if it was already in the map (already costed).
5854 if (!Result.second)
5855 continue;
5856 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5857 // getShuffleCost. If not then cost it using the worst case as the number
5858 // of element moves into a new vector.
5859 InstructionCost NCost =
5860 NumSources <= 2
5861 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5862 : TTI::SK_PermuteTwoSrc,
5863 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5864 CxtI)
5865 : LTNumElts;
5866 Result.first->second = NCost;
5867 Cost += NCost;
5868 }
5869 return Cost;
5870 }
5871
5872 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5873 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5874 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
5875 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
5876 // This currently only handles low or high extracts to prevent SLP vectorizer
5877 // regressions.
5878 // Note that SVE's ext instruction is destructive, but it can be fused with
5879 // a movprfx to act like a constructive instruction.
5880 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5881 if (LT.second.getFixedSizeInBits() >= 128 &&
5882 cast<FixedVectorType>(SubTp)->getNumElements() ==
5883 LT.second.getVectorNumElements() / 2) {
5884 if (Index == 0)
5885 return 0;
5886 if (Index == (int)LT.second.getVectorNumElements() / 2)
5887 return 1;
5888 }
5889 Kind = TTI::SK_PermuteSingleSrc;
5890 }
5891 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5892 // the code to handle length-changing shuffles.
5893 if (Kind == TTI::SK_InsertSubvector) {
5894 LT = getTypeLegalizationCost(DstTy);
5895 SrcTy = DstTy;
5896 }
5897
5898 // Segmented shuffle matching.
5899 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
5900 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5901 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5902 AArch64::SVEBitsPerBlock)) {
5903
5904 FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
5905 unsigned Segments =
5906 VTy->getPrimitiveSizeInBits().getFixedValue() / AArch64::SVEBitsPerBlock;
5907 unsigned SegmentElts = VTy->getNumElements() / Segments;
5908
5909 // dupq zd.t, zn.t[idx]
5910 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5911 ST->isSVEorStreamingSVEAvailable() &&
5912 isDUPQMask(Mask, Segments, SegmentElts))
5913 return LT.first;
5914
5915 // mov zd.q, vn
5916 if (ST->isSVEorStreamingSVEAvailable() &&
5917 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
5918 return LT.first;
5919 }
5920
5921 // Check for broadcast loads, which are supported by the LD1R instruction.
5922 // In terms of code-size, the shuffle vector is free when a load + dup get
5923 // folded into a LD1R. That's what we check and return here. For performance
5924 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5925 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5926 // that we model the load + dup sequence slightly higher because LD1R is a
5927 // high latency instruction.
5928 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
5929 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
5930 if (IsLoad && LT.second.isVector() &&
5931 isLegalBroadcastLoad(SrcTy->getElementType(),
5932 LT.second.getVectorElementCount()))
5933 return 0;
5934 }
5935
5936 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5937 // from the perfect shuffle tables.
5938 if (Mask.size() == 4 &&
5939 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
5940 (SrcTy->getScalarSizeInBits() == 16 ||
5941 SrcTy->getScalarSizeInBits() == 32) &&
5942 all_of(Mask, [](int E) { return E < 8; }))
5943 return getPerfectShuffleCost(Mask);
5944
5945 // Check for identity masks, which we can treat as free.
5946 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5947 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5948 all_of(enumerate(Mask), [](const auto &M) {
5949 return M.value() < 0 || M.value() == (int)M.index();
5950 }))
5951 return 0;
5952
5953 // Check for other shuffles that are not SK_ kinds but we have native
5954 // instructions for, for example ZIP and UZP.
5955 unsigned Unused;
5956 if (LT.second.isFixedLengthVector() &&
5957 LT.second.getVectorNumElements() == Mask.size() &&
5958 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5959 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5960 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
5961 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5962 LT.second.getVectorNumElements(), 16) ||
5963 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5964 LT.second.getVectorNumElements(), 32) ||
5965 isREVMask(Mask, LT.second.getScalarSizeInBits(),
5966 LT.second.getVectorNumElements(), 64) ||
5967 // Check for non-zero lane splats
5968 all_of(drop_begin(Mask),
5969 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
5970 return 1;
5971
5972 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
5973 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
5974 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
5975 static const CostTblEntry ShuffleTbl[] = {
5976 // Broadcast shuffle kinds can be performed with 'dup'.
5977 {TTI::SK_Broadcast, MVT::v8i8, 1},
5978 {TTI::SK_Broadcast, MVT::v16i8, 1},
5979 {TTI::SK_Broadcast, MVT::v4i16, 1},
5980 {TTI::SK_Broadcast, MVT::v8i16, 1},
5981 {TTI::SK_Broadcast, MVT::v2i32, 1},
5982 {TTI::SK_Broadcast, MVT::v4i32, 1},
5983 {TTI::SK_Broadcast, MVT::v2i64, 1},
5984 {TTI::SK_Broadcast, MVT::v4f16, 1},
5985 {TTI::SK_Broadcast, MVT::v8f16, 1},
5986 {TTI::SK_Broadcast, MVT::v4bf16, 1},
5987 {TTI::SK_Broadcast, MVT::v8bf16, 1},
5988 {TTI::SK_Broadcast, MVT::v2f32, 1},
5989 {TTI::SK_Broadcast, MVT::v4f32, 1},
5990 {TTI::SK_Broadcast, MVT::v2f64, 1},
5991 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5992 // 'zip1/zip2' instructions.
5993 {TTI::SK_Transpose, MVT::v8i8, 1},
5994 {TTI::SK_Transpose, MVT::v16i8, 1},
5995 {TTI::SK_Transpose, MVT::v4i16, 1},
5996 {TTI::SK_Transpose, MVT::v8i16, 1},
5997 {TTI::SK_Transpose, MVT::v2i32, 1},
5998 {TTI::SK_Transpose, MVT::v4i32, 1},
5999 {TTI::SK_Transpose, MVT::v2i64, 1},
6000 {TTI::SK_Transpose, MVT::v4f16, 1},
6001 {TTI::SK_Transpose, MVT::v8f16, 1},
6002 {TTI::SK_Transpose, MVT::v4bf16, 1},
6003 {TTI::SK_Transpose, MVT::v8bf16, 1},
6004 {TTI::SK_Transpose, MVT::v2f32, 1},
6005 {TTI::SK_Transpose, MVT::v4f32, 1},
6006 {TTI::SK_Transpose, MVT::v2f64, 1},
6007 // Select shuffle kinds.
6008 // TODO: handle vXi8/vXi16.
6009 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6010 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6011 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6012 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6013 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6014 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6015 // PermuteSingleSrc shuffle kinds.
6016 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6017 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6018 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6019 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6020 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6021 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6022 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6023 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6024 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6025 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6026 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6027 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6028 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6029 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6030 // Reverse can be lowered with `rev`.
6031 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6032 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6033 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6034 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6035 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6036 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6037 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6038 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6039 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6040 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6041 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6042 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6043 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6044 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6045 // Splice can all be lowered as `ext`.
6046 {TTI::SK_Splice, MVT::v2i32, 1},
6047 {TTI::SK_Splice, MVT::v4i32, 1},
6048 {TTI::SK_Splice, MVT::v2i64, 1},
6049 {TTI::SK_Splice, MVT::v2f32, 1},
6050 {TTI::SK_Splice, MVT::v4f32, 1},
6051 {TTI::SK_Splice, MVT::v2f64, 1},
6052 {TTI::SK_Splice, MVT::v8f16, 1},
6053 {TTI::SK_Splice, MVT::v8bf16, 1},
6054 {TTI::SK_Splice, MVT::v8i16, 1},
6055 {TTI::SK_Splice, MVT::v16i8, 1},
6056 {TTI::SK_Splice, MVT::v4f16, 1},
6057 {TTI::SK_Splice, MVT::v4bf16, 1},
6058 {TTI::SK_Splice, MVT::v4i16, 1},
6059 {TTI::SK_Splice, MVT::v8i8, 1},
6060 // Broadcast shuffle kinds for scalable vectors
6061 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6062 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6063 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6064 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6065 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6066 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6067 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6068 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6069 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6070 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6071 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6072 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6073 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6074 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6075 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6076 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6077 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6078 // Handle the cases for vector.reverse with scalable vectors
6079 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6080 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6081 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6082 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6083 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6084 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6085 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6086 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6087 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6088 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6089 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6090 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6091 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6092 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6093 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6094 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6095 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6096 };
6097 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6098 return LT.first * Entry->Cost;
6099 }
6100
6101 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6102 return getSpliceCost(SrcTy, Index, CostKind);
6103
6104 // Inserting a subvector can often be done with either a D, S or H register
6105 // move, so long as the inserted vector is "aligned".
6106 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6107 LT.second.getSizeInBits() <= 128 && SubTp) {
6108 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6109 if (SubLT.second.isVector()) {
6110 int NumElts = LT.second.getVectorNumElements();
6111 int NumSubElts = SubLT.second.getVectorNumElements();
6112 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6113 return SubLT.first;
6114 }
6115 }
6116
6117 // Restore optimal kind.
6118 if (IsExtractSubvector)
6119 Kind = TTI::SK_ExtractSubvector;
6120 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6121 Args, CxtI);
6122}
6123
6124static bool containsDecreasingPointers(Loop *TheLoop,
6125 PredicatedScalarEvolution *PSE) {
6126 const auto &Strides = DenseMap<Value *, const SCEV *>();
6127 for (BasicBlock *BB : TheLoop->blocks()) {
6128 // Scan the instructions in the block and look for addresses that are
6129 // consecutive and decreasing.
6130 for (Instruction &I : *BB) {
6131 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6132 Value *Ptr = getLoadStorePointerOperand(&I);
6133 Type *AccessTy = getLoadStoreType(&I);
6134 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
6135 /*ShouldCheckWrap=*/false)
6136 .value_or(0) < 0)
6137 return true;
6138 }
6139 }
6140 }
6141 return false;
6142}
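// Editor's note (illustrative sketch, not part of the original source; the IR
// names below are made up): the kind of access containsDecreasingPointers()
// flags is a load or store whose pointer recurrence has a negative stride, e.g.
//
//   loop:
//     %iv      = phi i64 [ %n, %entry ], [ %iv.next, %loop ]
//     %addr    = getelementptr inbounds i32, ptr %base, i64 %iv
//     %val     = load i32, ptr %addr
//     %iv.next = add nsw i64 %iv, -1
//
// Here getPtrStride() evaluates to -1 element per iteration, so the function
// returns true and the caller knows a reversed predicate would be required.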
6143
6144bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6145 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6146 return SVEPreferFixedOverScalableIfEqualCost;
6147 // For cases like post-LTO vectorization, when we eventually know the trip
6148 // count, a fixed-width vectorized epilogue can be removed entirely if the
6149 // trip count turns out to be less than the epilogue's iteration count. That's
6150 // why we prefer fixed-width vectorization for the epilogue when costs are equal.
6151 if (IsEpilogue)
6152 return true;
6153 return ST->useFixedOverScalableIfEqualCost();
6154}
6155
6156unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6157 return ST->getEpilogueVectorizationMinVF();
6158}
6159
6160bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6161 if (!ST->hasSVE())
6162 return false;
6163
6164 // We don't currently support vectorisation with interleaving for SVE - with
6165 // such loops we're better off not using tail-folding. This gives us a chance
6166 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6167 if (TFI->IAI->hasGroups())
6168 return false;
6169
6170 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6171 if (TFI->LVL->getReductionVars().size())
6172 Required |= TailFoldingOpts::Reductions;
6173 if (TFI->LVL->getFixedOrderRecurrences().size())
6174 Required |= TailFoldingOpts::Recurrences;
6175
6176 // We call this to discover whether any load/store pointers in the loop have
6177 // negative strides. This will require extra work to reverse the loop
6178 // predicate, which may be expensive.
6179 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6180 TFI->LVL->getPredicatedScalarEvolution()))
6181 Required |= TailFoldingOpts::Reverse;
6182 if (Required == TailFoldingOpts::Disabled)
6183 Required |= TailFoldingOpts::Simple;
6184
6185 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6186 Required))
6187 return false;
6188
6189 // Don't tail-fold for tight loops where we would be better off interleaving
6190 // with an unpredicated loop.
6191 unsigned NumInsns = 0;
6192 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6193 NumInsns += BB->sizeWithoutDebug();
6194 }
6195
6196 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6197 return NumInsns >= SVETailFoldInsnThreshold;
6198}
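// Editor's note (worked example, not part of the original source): with the
// default sve-tail-folding-insn-threshold of 15, a loop body of, say, 12
// instructions (roughly 4 of which are the IV PHI, IV add, IV compare and
// branch) gives NumInsns < 15, so this returns false and such a tight loop is
// left to unpredicated vectorisation rather than tail-folding.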
6199
6200InstructionCost
6201AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6202 StackOffset BaseOffset, bool HasBaseReg,
6203 int64_t Scale, unsigned AddrSpace) const {
6204 // Scaling factors are not free at all.
6205 // Operands | Rt Latency
6206 // -------------------------------------------
6207 // Rt, [Xn, Xm] | 4
6208 // -------------------------------------------
6209 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6210 // Rt, [Xn, Wm, <extend> #imm] |
6211 TargetLoweringBase::AddrMode AM;
6212 AM.BaseGV = BaseGV;
6213 AM.BaseOffs = BaseOffset.getFixed();
6214 AM.HasBaseReg = HasBaseReg;
6215 AM.Scale = Scale;
6216 AM.ScalableOffset = BaseOffset.getScalable();
6217 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6218 // Scale represents reg2 * scale, thus account for 1 if
6219 // it is not equal to 0 or 1.
6220 return AM.Scale != 0 && AM.Scale != 1;
6221 return -1;
6222}
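// Editor's note (illustrative only, not part of the original source): for an
// addressing mode that is legal with a genuine scale, e.g.
// ldr w0, [x1, x2, lsl #2] (Scale == 4), this returns 1; for Scale == 0 or 1
// (no scaled register) it returns 0; and if isLegalAddressingMode() rejects
// the combination, the function returns -1 to signal an unsupported mode.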
6223
6224bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6225 const Instruction *I) const {
6226 if (EnableOrLikeSelectOpt) {
6227 // For the binary operators (e.g. or) we need to be more careful than
6228 // selects, here we only transform them if they are already at a natural
6229 // break point in the code - the end of a block with an unconditional
6230 // terminator.
6231 if (I->getOpcode() == Instruction::Or &&
6232 isa<BranchInst>(I->getNextNode()) &&
6233 cast<BranchInst>(I->getNextNode())->isUnconditional())
6234 return true;
6235
6236 if (I->getOpcode() == Instruction::Add ||
6237 I->getOpcode() == Instruction::Sub)
6238 return true;
6239 }
6240 return false;
6241}
6242
6243bool AArch64TTIImpl::isLSRCostLess(
6244 const TargetTransformInfo::LSRCost &C1,
6245 const TargetTransformInfo::LSRCost &C2) const {
6246 // AArch64 specific here is adding the number of instructions to the
6247 // comparison (though not as the first consideration, as some targets do)
6248 // along with changing the priority of the base additions.
6249 // TODO: Maybe a more nuanced tradeoff between instruction count
6250 // and number of registers? To be investigated at a later date.
6251 if (EnableLSRCostOpt)
6252 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6253 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6254 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6255 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6256
6257 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6258}
6259
6260static bool isSplatShuffle(Value *V) {
6261 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6262 return all_equal(Shuf->getShuffleMask());
6263 return false;
6264}
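// Editor's note (illustrative sketch, not part of the original source; names
// are made up): a splat shuffle as recognised by isSplatShuffle(), in IR:
//
//   %splat = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
//
// Every mask element is 0, so all_equal() on the mask holds and the helper
// returns true.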
6265
6266/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6267/// or upper half of the vector elements.
6268static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6269 bool AllowSplat = false) {
6270 // Scalable types can't be extract shuffle vectors.
6271 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6272 return false;
6273
6274 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6275 auto *FullTy = FullV->getType();
6276 auto *HalfTy = HalfV->getType();
6277 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6278 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6279 };
6280
6281 auto extractHalf = [](Value *FullV, Value *HalfV) {
6282 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6283 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6284 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6285 };
6286
6287 ArrayRef<int> M1, M2;
6288 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6289 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6290 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6291 return false;
6292
6293 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6294 // it is not checked as an extract below.
6295 if (AllowSplat && isSplatShuffle(Op1))
6296 S1Op1 = nullptr;
6297 if (AllowSplat && isSplatShuffle(Op2))
6298 S2Op1 = nullptr;
6299
6300 // Check that the operands are half as wide as the result and we extract
6301 // half of the elements of the input vectors.
6302 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6303 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6304 return false;
6305
6306 // Check the mask extracts either the lower or upper half of vector
6307 // elements.
6308 int M1Start = 0;
6309 int M2Start = 0;
6310 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6311 if ((S1Op1 &&
6312 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6313 (S2Op1 &&
6314 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6315 return false;
6316
6317 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6318 (M2Start != 0 && M2Start != (NumElements / 2)))
6319 return false;
6320 if (S1Op1 && S2Op1 && M1Start != M2Start)
6321 return false;
6322
6323 return true;
6324}
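// Editor's note (illustrative sketch, not part of the original source): a pair
// of shuffles accepted by areExtractShuffleVectors(), both extracting the same
// (upper) half of two <16 x i8> sources, as produced when forming smull2/umull2:
//
//   %a.hi = shufflevector <16 x i8> %a, <16 x i8> poison,
//                         <8 x i32> <i32 8, i32 9, i32 10, i32 11,
//                                    i32 12, i32 13, i32 14, i32 15>
//   %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison,
//                         <8 x i32> <i32 8, i32 9, i32 10, i32 11,
//                                    i32 12, i32 13, i32 14, i32 15>
//
// Mixing a low-half and a high-half extract would fail the M1Start == M2Start
// check above.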
6325
6326/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6327/// of the vector elements.
6328static bool areExtractExts(Value *Ext1, Value *Ext2) {
6329 auto areExtDoubled = [](Instruction *Ext) {
6330 return Ext->getType()->getScalarSizeInBits() ==
6331 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6332 };
6333
6334 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6335 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6336 !areExtDoubled(cast<Instruction>(Ext1)) ||
6337 !areExtDoubled(cast<Instruction>(Ext2)))
6338 return false;
6339
6340 return true;
6341}
6342
6343/// Check if Op could be used with vmull_high_p64 intrinsic.
6344static bool isOperandOfVmullHighP64(Value *Op) {
6345 Value *VectorOperand = nullptr;
6346 ConstantInt *ElementIndex = nullptr;
6347 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6348 m_ConstantInt(ElementIndex))) &&
6349 ElementIndex->getValue() == 1 &&
6350 isa<FixedVectorType>(VectorOperand->getType()) &&
6351 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6352}
6353
6354/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6355static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6356 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6357}
6358
6359static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6360 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6361 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6362 if (!GEP || GEP->getNumOperands() != 2)
6363 return false;
6364
6365 Value *Base = GEP->getOperand(0);
6366 Value *Offsets = GEP->getOperand(1);
6367
6368 // We only care about scalar_base+vector_offsets.
6369 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6370 return false;
6371
6372 // Sink extends that would allow us to use 32-bit offset vectors.
6373 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6374 auto *OffsetsInst = cast<Instruction>(Offsets);
6375 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6376 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6377 Ops.push_back(&GEP->getOperandUse(1));
6378 }
6379
6380 // Sink the GEP.
6381 return true;
6382}
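// Editor's note (illustrative sketch, not part of the original source; names
// are made up): the form this helper expects is a gather/scatter address built
// as scalar base plus a vector of offsets, e.g.
//
//   %ext  = sext <vscale x 4 x i32> %idx to <vscale x 4 x i64>
//   %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %ext
//
// Sinking the sext (and the GEP) next to the masked gather lets instruction
// selection use a 32-bit offset addressing form instead of pre-widened offsets.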
6383
6384/// We want to sink the following cases:
6385/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6386/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6387static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6388 if (match(Op, m_VScale()))
6389 return true;
6390 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6391 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6392 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6393 return true;
6394 }
6395 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6396 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6397 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6398 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6399 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6400 return true;
6401 }
6402 return false;
6403}
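// Editor's note (illustrative sketch, not part of the original source; names
// are made up): one of the patterns shouldSinkVScale() matches, as it commonly
// appears in IR:
//
//   %vs   = call i64 @llvm.vscale.i64()
//   %off  = shl i64 %vs, 4                      ; vscale * 16
//   %addr = getelementptr i8, ptr %base, i64 %off
//
// Sinking the vscale call (and its shift) next to the gep lets instruction
// selection fold the whole offset into a vscale-based addressing form.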
6404
6405/// Check if sinking \p I's operands to I's basic block is profitable, because
6406/// the operands can be folded into a target instruction, e.g.
6407/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6408bool AArch64TTIImpl::isProfitableToSinkOperands(
6409 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6410 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
6411 switch (II->getIntrinsicID()) {
6412 case Intrinsic::aarch64_neon_smull:
6413 case Intrinsic::aarch64_neon_umull:
6414 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6415 /*AllowSplat=*/true)) {
6416 Ops.push_back(&II->getOperandUse(0));
6417 Ops.push_back(&II->getOperandUse(1));
6418 return true;
6419 }
6420 [[fallthrough]];
6421
6422 case Intrinsic::fma:
6423 case Intrinsic::fmuladd:
6424 if (isa<VectorType>(I->getType()) &&
6425 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6426 !ST->hasFullFP16())
6427 return false;
6428 [[fallthrough]];
6429 case Intrinsic::aarch64_neon_sqdmull:
6430 case Intrinsic::aarch64_neon_sqdmulh:
6431 case Intrinsic::aarch64_neon_sqrdmulh:
6432 // Sink splats for index lane variants
6433 if (isSplatShuffle(II->getOperand(0)))
6434 Ops.push_back(&II->getOperandUse(0));
6435 if (isSplatShuffle(II->getOperand(1)))
6436 Ops.push_back(&II->getOperandUse(1));
6437 return !Ops.empty();
6438 case Intrinsic::aarch64_neon_fmlal:
6439 case Intrinsic::aarch64_neon_fmlal2:
6440 case Intrinsic::aarch64_neon_fmlsl:
6441 case Intrinsic::aarch64_neon_fmlsl2:
6442 // Sink splats for index lane variants
6443 if (isSplatShuffle(II->getOperand(1)))
6444 Ops.push_back(&II->getOperandUse(1));
6445 if (isSplatShuffle(II->getOperand(2)))
6446 Ops.push_back(&II->getOperandUse(2));
6447 return !Ops.empty();
6448 case Intrinsic::aarch64_sve_ptest_first:
6449 case Intrinsic::aarch64_sve_ptest_last:
6450 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6451 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6452 Ops.push_back(&II->getOperandUse(0));
6453 return !Ops.empty();
6454 case Intrinsic::aarch64_sme_write_horiz:
6455 case Intrinsic::aarch64_sme_write_vert:
6456 case Intrinsic::aarch64_sme_writeq_horiz:
6457 case Intrinsic::aarch64_sme_writeq_vert: {
6458 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6459 if (!Idx || Idx->getOpcode() != Instruction::Add)
6460 return false;
6461 Ops.push_back(&II->getOperandUse(1));
6462 return true;
6463 }
6464 case Intrinsic::aarch64_sme_read_horiz:
6465 case Intrinsic::aarch64_sme_read_vert:
6466 case Intrinsic::aarch64_sme_readq_horiz:
6467 case Intrinsic::aarch64_sme_readq_vert:
6468 case Intrinsic::aarch64_sme_ld1b_vert:
6469 case Intrinsic::aarch64_sme_ld1h_vert:
6470 case Intrinsic::aarch64_sme_ld1w_vert:
6471 case Intrinsic::aarch64_sme_ld1d_vert:
6472 case Intrinsic::aarch64_sme_ld1q_vert:
6473 case Intrinsic::aarch64_sme_st1b_vert:
6474 case Intrinsic::aarch64_sme_st1h_vert:
6475 case Intrinsic::aarch64_sme_st1w_vert:
6476 case Intrinsic::aarch64_sme_st1d_vert:
6477 case Intrinsic::aarch64_sme_st1q_vert:
6478 case Intrinsic::aarch64_sme_ld1b_horiz:
6479 case Intrinsic::aarch64_sme_ld1h_horiz:
6480 case Intrinsic::aarch64_sme_ld1w_horiz:
6481 case Intrinsic::aarch64_sme_ld1d_horiz:
6482 case Intrinsic::aarch64_sme_ld1q_horiz:
6483 case Intrinsic::aarch64_sme_st1b_horiz:
6484 case Intrinsic::aarch64_sme_st1h_horiz:
6485 case Intrinsic::aarch64_sme_st1w_horiz:
6486 case Intrinsic::aarch64_sme_st1d_horiz:
6487 case Intrinsic::aarch64_sme_st1q_horiz: {
6488 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6489 if (!Idx || Idx->getOpcode() != Instruction::Add)
6490 return false;
6491 Ops.push_back(&II->getOperandUse(3));
6492 return true;
6493 }
6494 case Intrinsic::aarch64_neon_pmull:
6495 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6496 return false;
6497 Ops.push_back(&II->getOperandUse(0));
6498 Ops.push_back(&II->getOperandUse(1));
6499 return true;
6500 case Intrinsic::aarch64_neon_pmull64:
6501 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6502 II->getArgOperand(1)))
6503 return false;
6504 Ops.push_back(&II->getArgOperandUse(0));
6505 Ops.push_back(&II->getArgOperandUse(1));
6506 return true;
6507 case Intrinsic::masked_gather:
6508 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6509 return false;
6510 Ops.push_back(&II->getArgOperandUse(0));
6511 return true;
6512 case Intrinsic::masked_scatter:
6513 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6514 return false;
6515 Ops.push_back(&II->getArgOperandUse(1));
6516 return true;
6517 default:
6518 return false;
6519 }
6520 }
6521
6522 auto ShouldSinkCondition = [](Value *Cond,
6523 SmallVectorImpl<Use *> &Ops) -> bool {
6524 if (!isa<IntrinsicInst>(Cond))
6525 return false;
6526 auto *II = cast<IntrinsicInst>(Cond);
6527 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6528 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6529 return false;
6530 if (isa<CmpInst>(II->getOperand(0)))
6531 Ops.push_back(&II->getOperandUse(0));
6532 return true;
6533 };
6534
6535 switch (I->getOpcode()) {
6536 case Instruction::GetElementPtr:
6537 case Instruction::Add:
6538 case Instruction::Sub:
6539 // Sink vscales closer to uses for better isel
6540 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6541 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6542 Ops.push_back(&I->getOperandUse(Op));
6543 return true;
6544 }
6545 }
6546 break;
6547 case Instruction::Select: {
6548 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6549 return false;
6550
6551 Ops.push_back(&I->getOperandUse(0));
6552 return true;
6553 }
6554 case Instruction::Br: {
6555 if (cast<BranchInst>(I)->isUnconditional())
6556 return false;
6557
6558 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6559 return false;
6560
6561 Ops.push_back(&I->getOperandUse(0));
6562 return true;
6563 }
6564 default:
6565 break;
6566 }
6567
6568 if (!I->getType()->isVectorTy())
6569 return false;
6570
6571 switch (I->getOpcode()) {
6572 case Instruction::Sub:
6573 case Instruction::Add: {
6574 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6575 return false;
6576
6577 // If the exts' operands extract either the lower or upper elements, we
6578 // can sink them too.
6579 auto Ext1 = cast<Instruction>(I->getOperand(0));
6580 auto Ext2 = cast<Instruction>(I->getOperand(1));
6581 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6582 Ops.push_back(&Ext1->getOperandUse(0));
6583 Ops.push_back(&Ext2->getOperandUse(0));
6584 }
6585
6586 Ops.push_back(&I->getOperandUse(0));
6587 Ops.push_back(&I->getOperandUse(1));
6588
6589 return true;
6590 }
6591 case Instruction::Or: {
6592 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6593 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6594 if (ST->hasNEON()) {
6595 Instruction *OtherAnd, *IA, *IB;
6596 Value *MaskValue;
6597 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
6598 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6599 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6600 m_Instruction(IA)))))) {
6601 if (match(OtherAnd,
6602 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6603 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6604 ? cast<Instruction>(I->getOperand(1))
6605 : cast<Instruction>(I->getOperand(0));
6606
6607 // Both Ands should be in the same basic block as the Or.
6608 if (I->getParent() != MainAnd->getParent() ||
6609 I->getParent() != OtherAnd->getParent())
6610 return false;
6611
6612 // Non-mask operands of both Ands should also be in the same basic block.
6613 if (I->getParent() != IA->getParent() ||
6614 I->getParent() != IB->getParent())
6615 return false;
6616
6617 Ops.push_back(
6618 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6619 Ops.push_back(&I->getOperandUse(0));
6620 Ops.push_back(&I->getOperandUse(1));
6621
6622 return true;
6623 }
6624 }
6625 }
6626
6627 return false;
6628 }
6629 case Instruction::Mul: {
6630 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6631 auto *Ty = cast<VectorType>(V->getType());
6632 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6633 if (Ty->isScalableTy())
6634 return false;
6635
6636 // Indexed variants of Mul exist for i16 and i32 element types only.
6637 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6638 };
6639
6640 int NumZExts = 0, NumSExts = 0;
6641 for (auto &Op : I->operands()) {
6642 // Make sure we are not already sinking this operand
6643 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6644 continue;
6645
6646 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6647 auto *Ext = cast<Instruction>(Op);
6648 auto *ExtOp = Ext->getOperand(0);
6649 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6650 Ops.push_back(&Ext->getOperandUse(0));
6651 Ops.push_back(&Op);
6652
6653 if (isa<SExtInst>(Ext))
6654 NumSExts++;
6655 else
6656 NumZExts++;
6657
6658 continue;
6659 }
6660
6661 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6662 if (!Shuffle)
6663 continue;
6664
6665 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6666 // operand and the s/zext can help create indexed s/umull. This is
6667 // especially useful to prevent i64 mul being scalarized.
6668 if (isSplatShuffle(Shuffle) &&
6669 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6670 Ops.push_back(&Shuffle->getOperandUse(0));
6671 Ops.push_back(&Op);
6672 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6673 NumSExts++;
6674 else
6675 NumZExts++;
6676 continue;
6677 }
6678
6679 Value *ShuffleOperand = Shuffle->getOperand(0);
6680 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6681 if (!Insert)
6682 continue;
6683
6684 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6685 if (!OperandInstr)
6686 continue;
6687
6688 ConstantInt *ElementConstant =
6689 dyn_cast<ConstantInt>(Insert->getOperand(2));
6690 // Check that the insertelement is inserting into element 0
6691 if (!ElementConstant || !ElementConstant->isZero())
6692 continue;
6693
6694 unsigned Opcode = OperandInstr->getOpcode();
6695 if (Opcode == Instruction::SExt)
6696 NumSExts++;
6697 else if (Opcode == Instruction::ZExt)
6698 NumZExts++;
6699 else {
6700 // If we find that the top bits are known 0, then we can sink and allow
6701 // the backend to generate a umull.
6702 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6703 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6704 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6705 continue;
6706 NumZExts++;
6707 }
6708
6709 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6710 // the And, just to hoist it again back to the load.
6711 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6712 Ops.push_back(&Insert->getOperandUse(1));
6713 Ops.push_back(&Shuffle->getOperandUse(0));
6714 Ops.push_back(&Op);
6715 }
6716
6717 // It is profitable to sink if we found two of the same type of extends.
6718 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6719 return true;
6720
6721 // Otherwise, see if we should sink splats for indexed variants.
6722 if (!ShouldSinkSplatForIndexedVariant(I))
6723 return false;
6724
6725 Ops.clear();
6726 if (isSplatShuffle(I->getOperand(0)))
6727 Ops.push_back(&I->getOperandUse(0));
6728 if (isSplatShuffle(I->getOperand(1)))
6729 Ops.push_back(&I->getOperandUse(1));
6730
6731 return !Ops.empty();
6732 }
6733 case Instruction::FMul: {
6734 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6735 if (I->getType()->isScalableTy())
6736 return false;
6737
6738 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6739 !ST->hasFullFP16())
6740 return false;
6741
6742 // Sink splats for index lane variants
6743 if (isSplatShuffle(I->getOperand(0)))
6744 Ops.push_back(&I->getOperandUse(0));
6745 if (isSplatShuffle(I->getOperand(1)))
6746 Ops.push_back(&I->getOperandUse(1));
6747 return !Ops.empty();
6748 }
6749 default:
6750 return false;
6751 }
6752 return false;
6753}
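// Editor's note (illustrative sketch, not part of the original source): the
// Instruction::Mul handling above is what lets a widening multiply such as
//
//   %ae = zext <4 x i16> %a to <4 x i32>
//   %be = zext <4 x i16> %b to <4 x i32>
//   %m  = mul <4 x i32> %ae, %be
//
// become a single umull once both zexts are sunk into the block containing the
// mul (the NumZExts == 2 case marks the sink as profitable).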
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
unsigned countLeadingOnes() const
Definition APInt.h:1624
void negate()
Negate this APInt in place.
Definition APInt.h:1468
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:760
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
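The combiner-aware replacement routines above are used like this; the fold itself is a placeholder, not one performed by this file:

#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

static Instruction *foldToOperand(InstCombiner &IC, Instruction &I) {
  // Replace all uses of I with its first operand; the combiner worklist
  // machinery re-queues the affected users.
  return IC.replaceInstUsesWith(I, I.getOperand(0));
}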
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
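A minimal sketch of the intended discipline: propagate costs, check validity, and only then call getValue():

#include "llvm/Support/InstructionCost.h"

using namespace llvm;

static bool fitsBudget(InstructionCost A, InstructionCost B, unsigned Budget) {
  InstructionCost Total = A + B;   // An invalid cost propagates through '+'.
  if (!Total.isValid())
    return false;                  // Never call getValue() on an invalid cost.
  return Total.getValue() <= Budget;
}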
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:825
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e. a pessimistic estimate of) the exact backedge-taken count of the loop.
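A sketch of the typical trip-count query sequence built from the ScalarEvolution entry points above (the Limit parameter is illustrative):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

using namespace llvm;

static bool hasSmallKnownTripCount(ScalarEvolution &SE, const Loop *L,
                                   unsigned Limit) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return false; // Backedge-taken count is not predictable.
  // 0 means the maximum trip count is unknown.
  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
  return MaxTC != 0 && MaxTC <= Limit;
}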
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
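For example, the interleave-mask check can be applied directly to a shuffle's constant mask (a factor of 2 is assumed purely for illustration):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool isInterleaveOfTwo(ArrayRef<int> Mask, unsigned NumInputElts) {
  // StartIndexes receives the starting lane of each interleaved input.
  SmallVector<unsigned, 2> StartIndexes;
  return ShuffleVectorInst::isInterleaveMask(Mask, /*Factor=*/2, NumInputElts,
                                             StartIndexes);
}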
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:31
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:40
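A sketch of composing fixed and scalable parts into one StackOffset, assuming 16-byte SVE register granules purely for illustration:

#include "llvm/Support/TypeSize.h"

using namespace llvm;

static StackOffset frameOffsetForSpill(int64_t FixedBytes, int64_t SVESlots) {
  // Fixed bytes plus a vscale-scaled part of SVESlots * 16 bytes.
  return StackOffset::getFixed(FixedBytes) +
         StackOffset::getScalable(SVESlots * 16);
}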
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const DataLayout & getDataLayout() const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of the instruction.
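The cost-kind enumerators above select the metric returned by the TTI cost queries; a minimal sketch summing reciprocal-throughput costs over a block (the helper name is illustrative):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

static InstructionCost blockThroughputCost(const TargetTransformInfo &TTI,
                                           const BasicBlock &BB) {
  InstructionCost Cost = 0;
  for (const Instruction &I : BB)
    Cost +=
        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
  return Cost;
}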
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:347
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
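A sketch using the bit-width helpers above to double the lane width of an integer or integer-vector type while keeping its element count, as cost code often does when modelling extends:

#include "llvm/IR/Type.h"

using namespace llvm;

static Type *widenLanes(Type *Ty) {
  // getScalarSizeInBits looks through vectors to the element type;
  // getWithNewBitWidth keeps the (possibly scalable) element count.
  return Ty->getWithNewBitWidth(Ty->getScalarSizeInBits() * 2);
}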
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:217
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
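A sketch tying VectorType, ElementCount and TypeSize together: build a <vscale x 4 x i32> type and compare its known-minimum size against a 128-bit block:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static bool fitsIn128BitBlock(LLVMContext &Ctx) {
  auto *VTy = VectorType::get(Type::getInt32Ty(Ctx),
                              ElementCount::getScalable(4));
  TypeSize Size = VTy->getPrimitiveSizeInBits();
  // For scalable types only the known minimum (here 128 bits) is comparable
  // at compile time; the full size is scaled by vscale.
  return Size.isScalable() && Size.getKnownMinValue() == 128;
}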
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
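A sketch in the same PatternMatch style as the surrounding code: recognize an AND feeding an equality compare against zero (the helper name is invented):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

static bool isMaskTestAgainstZero(Value *V) {
  CmpPredicate Pred;
  Value *X, *Y;
  // Matches icmp eq (and X, Y), 0 with the operands of the AND in either order.
  return match(V, m_ICmp(Pred, m_c_And(m_Value(X), m_Value(Y)), m_Zero())) &&
         Pred == ICmpInst::ICMP_EQ;
}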
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
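A sketch of the cost-table idiom built from CostTblEntry and CostTableLookup; the entries and the fallback value are illustrative, not the real AArch64 tables:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"

using namespace llvm;

static unsigned lookupAddCost(MVT Ty) {
  static const CostTblEntry Tbl[] = {
      {ISD::ADD, MVT::v4i32, 1}, // Illustrative costs only.
      {ISD::ADD, MVT::v2i64, 1},
  };
  if (const auto *Entry = CostTableLookup(Tbl, ISD::ADD, Ty))
    return Entry->Cost;
  return 4; // Fallback guess when the type is not in the table.
}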
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
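A sketch using the AArch64 permute-mask helpers above (assuming their declarations from the AArch64 backend are visible; the wrapper itself is illustrative):

#include "llvm/ADT/ArrayRef.h"

using namespace llvm;

static bool isZipOrUzp(ArrayRef<int> Mask, unsigned NumElts) {
  unsigned WhichResult; // Selects between the *1 and *2 forms.
  return isZIPMask(Mask, NumElts, WhichResult) ||
         isUZPMask(Mask, NumElts, WhichResult);
}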
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2108
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
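A sketch of the usual EVT gate before an MVT-keyed table lookup: convert the IR type, require a simple fixed-length vector, and hand back the MVT:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Type.h"

using namespace llvm;

static bool isSimpleFixedVector(Type *Ty, MVT &SimpleVT) {
  EVT VT = EVT::getEVT(Ty, /*HandleUnknown=*/true);
  if (!VT.isSimple() || !VT.isFixedLengthVector())
    return false;
  SimpleVT = VT.getSimpleVT();
  return true;
}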
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of a comparison against zero (memcmp(p1, p2, s) == 0).
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
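A sketch of the kind of tuning applied through UnrollingPreferences; the specific values are illustrative, not the defaults chosen by this target:

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // Allow partial unrolling.
  UP.Runtime = true;                // Allow runtime unrolling.
  UP.PartialThreshold = 150;        // Size budget for partial unrolling.
  UP.DefaultUnrollRuntimeCount = 4; // Unroll factor for runtime trip counts.
  UP.UnrollRemainder = true;        // Also unroll the remainder loop.
}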