1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
44 cl::Hidden);
45
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
52static cl::opt<unsigned>
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
54 cl::Hidden);
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
72static cl::opt<unsigned>
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
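  // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to
  // TailFoldingOpts::All and records Reverse in DisableBits.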
94 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
214 cl::location(TailFoldingOptionLoc));
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
238static bool hasPossibleIncompatibleOps(const Function *F,
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
248 isSMEABIRoutineCall(cast<CallInst>(I), TLI)))
249 return true;
250 }
251 }
252 return false;
253}
254
256 StringRef AttributeStr =
257 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
258 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
259 SmallVector<StringRef, 8> Features;
260 FeatureStr.split(Features, ",");
261 return AArch64::getFMVPriority(Features);
262}
263
264bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
265 return F.hasFnAttribute("fmv-features");
266}
267
268const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
269 AArch64::FeatureExecuteOnly,
270};
271
272bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
273 const Function *Callee) const {
274 SMECallAttrs CallAttrs(*Caller, *Callee);
275
276 // Never inline a function explicitly marked as being streaming,
277 // into a non-streaming function. Assume it was marked as streaming
278 // for a reason.
279 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
280 CallAttrs.callee().hasStreamingInterfaceOrBody())
281 return false;
282
283 // When inlining, we should consider the body of the function, not the
284 // interface.
285 if (CallAttrs.callee().hasStreamingBody()) {
286 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
287 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
288 }
289
290 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
291 return false;
292
293 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
294 CallAttrs.requiresPreservingZT0() ||
295 CallAttrs.requiresPreservingAllZAState()) {
296 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
297 return false;
298 }
299
300 const TargetMachine &TM = getTLI()->getTargetMachine();
301 const FeatureBitset &CallerBits =
302 TM.getSubtargetImpl(*Caller)->getFeatureBits();
303 const FeatureBitset &CalleeBits =
304 TM.getSubtargetImpl(*Callee)->getFeatureBits();
305 // Adjust the feature bitsets by inverting some of the bits. This is needed
306 // for target features that represent restrictions rather than capabilities,
307 // for example a "+execute-only" callee can be inlined into a caller without
308 // "+execute-only", but not vice versa.
309 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
310 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
311
312 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
313}
314
315bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
316 const Function *Callee,
317 ArrayRef<Type *> Types) const {
318 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
319 return false;
320
321 // We need to ensure that argument promotion does not attempt to promote
322 // pointers to fixed-length vector types larger than 128 bits like
323 // <8 x float> (and pointers to aggregate types which have such fixed-length
324 // vector type members) into the values of the pointees. Such vector types
325 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
326 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
327 // types can be safely treated as 128-bit NEON types and they cannot be
328 // distinguished in IR.
329 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
330 auto FVTy = dyn_cast<FixedVectorType>(Ty);
331 return FVTy &&
332 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
333 }))
334 return false;
335
336 return true;
337}
338
339unsigned
340AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
341 unsigned DefaultCallPenalty) const {
342 // This function calculates a penalty for executing Call in F.
343 //
344 // There are two ways this function can be called:
345 // (1) F:
346 // call from F -> G (the call here is Call)
347 //
348 // For (1), Call.getCaller() == F, so it will always return a high cost if
349 // a streaming-mode change is required (thus promoting the need to inline the
350 // function)
351 //
352 // (2) F:
353 // call from F -> G (the call here is not Call)
354 // G:
355 // call from G -> H (the call here is Call)
356 //
357 // For (2), if after inlining the body of G into F the call to H requires a
358 // streaming-mode change, and the call to G from F would also require a
359 // streaming-mode change, then there is benefit to do the streaming-mode
360 // change only once and avoid inlining of G into F.
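  // With the default option values this is 5 x DefaultCallPenalty for case (1)
  // and 10 x DefaultCallPenalty for case (2).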
361
362 SMEAttrs FAttrs(*F);
363 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
364
365 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
366 if (F == Call.getCaller()) // (1)
367 return CallPenaltyChangeSM * DefaultCallPenalty;
368 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
369 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
370 }
371
372 return DefaultCallPenalty;
373}
374
376bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
377 TargetTransformInfo::RegisterKind K) const {
378
379 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
380 return true;
381
382 return K == TargetTransformInfo::RGK_ScalableVector &&
383 ST->isSVEorStreamingSVEAvailable() &&
384 !ST->disableMaximizeScalableBandwidth();
385}
386
387/// Calculate the cost of materializing a 64-bit value. This helper
388/// method might only calculate a fraction of a larger immediate. Therefore it
389/// is valid to return a cost of ZERO.
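/// For example, a value such as 0x12345678 needs MOVZ+MOVK (cost 2), while an
/// arbitrary 64-bit pattern may need a MOVZ plus three MOVKs (cost 4).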
390InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
391 // Check if the immediate can be encoded within an instruction.
392 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
393 return 0;
394
395 if (Val < 0)
396 Val = ~Val;
397
398 // Calculate how many moves we will need to materialize this constant.
399 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
400 AArch64_IMM::expandMOVImm(Val, 64, Insn);
401 return Insn.size();
402}
403
404/// Calculate the cost of materializing the given constant.
406InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
407 TTI::TargetCostKind CostKind) const {
408 assert(Ty->isIntegerTy());
409
410 unsigned BitSize = Ty->getPrimitiveSizeInBits();
411 if (BitSize == 0)
412 return ~0U;
413
414 // Sign-extend all constants to a multiple of 64-bit.
415 APInt ImmVal = Imm;
416 if (BitSize & 0x3f)
417 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
418
419 // Split the constant into 64-bit chunks and calculate the cost for each
420 // chunk.
421 InstructionCost Cost = 0;
422 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
423 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
424 int64_t Val = Tmp.getSExtValue();
425 Cost += getIntImmCost(Val);
426 }
427 // We need at least one instruction to materialize the constant.
428 return std::max<InstructionCost>(1, Cost);
429}
430
431InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
432 const APInt &Imm, Type *Ty,
433 TTI::TargetCostKind CostKind,
434 Instruction *Inst) const {
435 assert(Ty->isIntegerTy());
436
437 unsigned BitSize = Ty->getPrimitiveSizeInBits();
438 // There is no cost model for constants with a bit size of 0. Return TCC_Free
439 // here, so that constant hoisting will ignore this constant.
440 if (BitSize == 0)
441 return TTI::TCC_Free;
442
443 unsigned ImmIdx = ~0U;
444 switch (Opcode) {
445 default:
446 return TTI::TCC_Free;
447 case Instruction::GetElementPtr:
448 // Always hoist the base address of a GetElementPtr.
449 if (Idx == 0)
450 return 2 * TTI::TCC_Basic;
451 return TTI::TCC_Free;
452 case Instruction::Store:
453 ImmIdx = 0;
454 break;
455 case Instruction::Add:
456 case Instruction::Sub:
457 case Instruction::Mul:
458 case Instruction::UDiv:
459 case Instruction::SDiv:
460 case Instruction::URem:
461 case Instruction::SRem:
462 case Instruction::And:
463 case Instruction::Or:
464 case Instruction::Xor:
465 case Instruction::ICmp:
466 ImmIdx = 1;
467 break;
468 // Always return TCC_Free for the shift value of a shift instruction.
469 case Instruction::Shl:
470 case Instruction::LShr:
471 case Instruction::AShr:
472 if (Idx == 1)
473 return TTI::TCC_Free;
474 break;
475 case Instruction::Trunc:
476 case Instruction::ZExt:
477 case Instruction::SExt:
478 case Instruction::IntToPtr:
479 case Instruction::PtrToInt:
480 case Instruction::BitCast:
481 case Instruction::PHI:
482 case Instruction::Call:
483 case Instruction::Select:
484 case Instruction::Ret:
485 case Instruction::Load:
486 break;
487 }
488
489 if (Idx == ImmIdx) {
490 int NumConstants = (BitSize + 63) / 64;
491 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
492 return (Cost <= NumConstants * TTI::TCC_Basic)
493 ? static_cast<int>(TTI::TCC_Free)
494 : Cost;
495 }
496 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
497}
498
499InstructionCost
500AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
501 const APInt &Imm, Type *Ty,
502 TTI::TargetCostKind CostKind) const {
503 assert(Ty->isIntegerTy());
504
505 unsigned BitSize = Ty->getPrimitiveSizeInBits();
506 // There is no cost model for constants with a bit size of 0. Return TCC_Free
507 // here, so that constant hoisting will ignore this constant.
508 if (BitSize == 0)
509 return TTI::TCC_Free;
510
511 // Most (all?) AArch64 intrinsics do not support folding immediates into the
512 // selected instruction, so we compute the materialization cost for the
513 // immediate directly.
514 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
515 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
516
517 switch (IID) {
518 default:
519 return TTI::TCC_Free;
520 case Intrinsic::sadd_with_overflow:
521 case Intrinsic::uadd_with_overflow:
522 case Intrinsic::ssub_with_overflow:
523 case Intrinsic::usub_with_overflow:
524 case Intrinsic::smul_with_overflow:
525 case Intrinsic::umul_with_overflow:
526 if (Idx == 1) {
527 int NumConstants = (BitSize + 63) / 64;
528 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
529 return (Cost <= NumConstants * TTI::TCC_Basic)
530 ? static_cast<int>(TTI::TCC_Free)
531 : Cost;
532 }
533 break;
534 case Intrinsic::experimental_stackmap:
535 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 case Intrinsic::experimental_patchpoint_void:
539 case Intrinsic::experimental_patchpoint:
540 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
541 return TTI::TCC_Free;
542 break;
543 case Intrinsic::experimental_gc_statepoint:
544 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
545 return TTI::TCC_Free;
546 break;
547 }
548 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
549}
550
551TargetTransformInfo::PopcntSupportKind
552AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
553 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
554 if (TyWidth == 32 || TyWidth == 64)
555 return TTI::PSK_FastHardware;
556 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
557 return TTI::PSK_Software;
558}
559
560static bool isUnpackedVectorVT(EVT VecVT) {
561 return VecVT.isScalableVector() &&
562 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
563}
564
565static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
566 const IntrinsicCostAttributes &ICA) {
567 // We need to know at least the number of elements in the vector of buckets
568 // and the size of each element to update.
569 if (ICA.getArgTypes().size() < 2)
570 return InstructionCost::getInvalid();
571
572 // Only interested in costing for the hardware instruction from SVE2.
573 if (!ST->hasSVE2())
574 return InstructionCost::getInvalid();
575
576 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
577 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
578 unsigned TotalHistCnts = 1;
579
580 unsigned EltSize = EltTy->getScalarSizeInBits();
581 // Only allow (up to 64b) integers or pointers
582 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
583 return InstructionCost::getInvalid();
584
585 // FIXME: We should be able to generate histcnt for fixed-length vectors
586 // using ptrue with a specific VL.
587 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
588 unsigned EC = VTy->getElementCount().getKnownMinValue();
589 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
590 return InstructionCost::getInvalid();
591
592 // HistCnt only supports 32b and 64b element types
593 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
594
595 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
596 return InstructionCost::getInvalid();
597
598 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
599 TotalHistCnts = EC / NaturalVectorWidth;
600
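 // e.g. <vscale x 4 x i32> buckets need a single HISTCNT, whereas
 // <vscale x 8 x i32> buckets legalize to two HISTCNT operations.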
601 return InstructionCost(BaseHistCntCost * TotalHistCnts);
602 }
603
604 return InstructionCost::getInvalid();
605}
606
607InstructionCost
608AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
609 TTI::TargetCostKind CostKind) const {
610 // The code-generator is currently not able to handle scalable vectors
611 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
612 // it. This change will be removed when code-generation for these types is
613 // sufficiently reliable.
614 auto *RetTy = ICA.getReturnType();
615 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
616 if (VTy->getElementCount() == ElementCount::getScalable(1))
617 return InstructionCost::getInvalid();
618
619 switch (ICA.getID()) {
620 case Intrinsic::experimental_vector_histogram_add: {
621 InstructionCost HistCost = getHistogramCost(ST, ICA);
622 // If the cost isn't valid, we may still be able to scalarize
623 if (HistCost.isValid())
624 return HistCost;
625 break;
626 }
627 case Intrinsic::umin:
628 case Intrinsic::umax:
629 case Intrinsic::smin:
630 case Intrinsic::smax: {
631 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
632 MVT::v8i16, MVT::v2i32, MVT::v4i32,
633 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
634 MVT::nxv2i64};
635 auto LT = getTypeLegalizationCost(RetTy);
636 // v2i64 types get converted to cmp+bif hence the cost of 2
637 if (LT.second == MVT::v2i64)
638 return LT.first * 2;
639 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
640 return LT.first;
641 break;
642 }
643 case Intrinsic::sadd_sat:
644 case Intrinsic::ssub_sat:
645 case Intrinsic::uadd_sat:
646 case Intrinsic::usub_sat: {
647 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
648 MVT::v8i16, MVT::v2i32, MVT::v4i32,
649 MVT::v2i64};
650 auto LT = getTypeLegalizationCost(RetTy);
651 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
652 // need to extend the type, as it uses shr(qadd(shl, shl)).
653 unsigned Instrs =
654 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
655 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
656 return LT.first * Instrs;
657
659 uint64_t VectorSize = TS.getKnownMinValue();
660
661 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
662 return LT.first * Instrs;
663
664 break;
665 }
666 case Intrinsic::abs: {
667 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
668 MVT::v8i16, MVT::v2i32, MVT::v4i32,
669 MVT::v2i64};
670 auto LT = getTypeLegalizationCost(RetTy);
671 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
672 return LT.first;
673 break;
674 }
675 case Intrinsic::bswap: {
676 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
677 MVT::v4i32, MVT::v2i64};
678 auto LT = getTypeLegalizationCost(RetTy);
679 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
680 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
681 return LT.first;
682 break;
683 }
684 case Intrinsic::fma:
685 case Intrinsic::fmuladd: {
686 // Given a fma or fmuladd, cost it the same as a fmul instruction which are
687 // usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
688 Type *EltTy = RetTy->getScalarType();
689 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
690 (EltTy->isHalfTy() && ST->hasFullFP16()))
691 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
692 break;
693 }
694 case Intrinsic::stepvector: {
695 InstructionCost Cost = 1; // Cost of the `index' instruction
696 auto LT = getTypeLegalizationCost(RetTy);
697 // Legalisation of illegal vectors involves an `index' instruction plus
698 // (LT.first - 1) vector adds.
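 // e.g. a <vscale x 8 x i64> step vector splits into four legal parts:
 // one INDEX instruction plus three vector ADDs, for a cost of 4.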
699 if (LT.first > 1) {
700 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
701 InstructionCost AddCost =
702 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
703 Cost += AddCost * (LT.first - 1);
704 }
705 return Cost;
706 }
707 case Intrinsic::vector_extract:
708 case Intrinsic::vector_insert: {
709 // If both the vector and subvector types are legal types and the index
710 // is 0, then this should be a no-op or simple operation; return a
711 // relatively low cost.
712
713 // If arguments aren't actually supplied, then we cannot determine the
714 // value of the index. We also want to skip predicate types.
715 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
716 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
717 break;
718
719 LLVMContext &C = RetTy->getContext();
720 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
721 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
722 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
723 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
724 // Skip this if either the vector or subvector types are unpacked
725 // SVE types; they may get lowered to stack stores and loads.
726 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
727 break;
728
729 TargetLoweringBase::LegalizeKind SubVecLK =
730 getTLI()->getTypeConversion(C, SubVecVT);
731 TargetLoweringBase::LegalizeKind VecLK =
732 getTLI()->getTypeConversion(C, VecVT);
733 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
734 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
735 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
736 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
737 return TTI::TCC_Free;
738 break;
739 }
740 case Intrinsic::bitreverse: {
741 static const CostTblEntry BitreverseTbl[] = {
742 {Intrinsic::bitreverse, MVT::i32, 1},
743 {Intrinsic::bitreverse, MVT::i64, 1},
744 {Intrinsic::bitreverse, MVT::v8i8, 1},
745 {Intrinsic::bitreverse, MVT::v16i8, 1},
746 {Intrinsic::bitreverse, MVT::v4i16, 2},
747 {Intrinsic::bitreverse, MVT::v8i16, 2},
748 {Intrinsic::bitreverse, MVT::v2i32, 2},
749 {Intrinsic::bitreverse, MVT::v4i32, 2},
750 {Intrinsic::bitreverse, MVT::v1i64, 2},
751 {Intrinsic::bitreverse, MVT::v2i64, 2},
752 };
753 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
754 const auto *Entry =
755 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
756 if (Entry) {
757 // Cost Model is using the legal type(i32) that i8 and i16 will be
758 // converted to +1 so that we match the actual lowering cost
759 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
760 TLI->getValueType(DL, RetTy, true) == MVT::i16)
761 return LegalisationCost.first * Entry->Cost + 1;
762
763 return LegalisationCost.first * Entry->Cost;
764 }
765 break;
766 }
767 case Intrinsic::ctpop: {
768 if (!ST->hasNEON()) {
769 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
770 return getTypeLegalizationCost(RetTy).first * 12;
771 }
772 static const CostTblEntry CtpopCostTbl[] = {
773 {ISD::CTPOP, MVT::v2i64, 4},
774 {ISD::CTPOP, MVT::v4i32, 3},
775 {ISD::CTPOP, MVT::v8i16, 2},
776 {ISD::CTPOP, MVT::v16i8, 1},
777 {ISD::CTPOP, MVT::i64, 4},
778 {ISD::CTPOP, MVT::v2i32, 3},
779 {ISD::CTPOP, MVT::v4i16, 2},
780 {ISD::CTPOP, MVT::v8i8, 1},
781 {ISD::CTPOP, MVT::i32, 5},
782 };
783 auto LT = getTypeLegalizationCost(RetTy);
784 MVT MTy = LT.second;
785 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
786 // Extra cost of +1 when illegal vector types are legalized by promoting
787 // the integer type.
788 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
789 RetTy->getScalarSizeInBits()
790 ? 1
791 : 0;
792 return LT.first * Entry->Cost + ExtraCost;
793 }
794 break;
795 }
796 case Intrinsic::sadd_with_overflow:
797 case Intrinsic::uadd_with_overflow:
798 case Intrinsic::ssub_with_overflow:
799 case Intrinsic::usub_with_overflow:
800 case Intrinsic::smul_with_overflow:
801 case Intrinsic::umul_with_overflow: {
802 static const CostTblEntry WithOverflowCostTbl[] = {
803 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
804 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
805 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
806 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
807 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
808 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
809 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
810 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
811 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
812 {Intrinsic::usub_with_overflow, MVT::i8, 3},
813 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
814 {Intrinsic::usub_with_overflow, MVT::i16, 3},
815 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
816 {Intrinsic::usub_with_overflow, MVT::i32, 1},
817 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
818 {Intrinsic::usub_with_overflow, MVT::i64, 1},
819 {Intrinsic::smul_with_overflow, MVT::i8, 5},
820 {Intrinsic::umul_with_overflow, MVT::i8, 4},
821 {Intrinsic::smul_with_overflow, MVT::i16, 5},
822 {Intrinsic::umul_with_overflow, MVT::i16, 4},
823 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
824 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
825 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
826 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
827 };
828 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
829 if (MTy.isSimple())
830 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
831 MTy.getSimpleVT()))
832 return Entry->Cost;
833 break;
834 }
835 case Intrinsic::fptosi_sat:
836 case Intrinsic::fptoui_sat: {
837 if (ICA.getArgTypes().empty())
838 break;
839 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
840 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
841 EVT MTy = TLI->getValueType(DL, RetTy);
842 // Check for the legal types, which are where the size of the input and the
843 // output are the same, or we are using cvt f64->i32 or f32->i64.
844 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
845 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
846 LT.second == MVT::v2f64)) {
847 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
848 (LT.second == MVT::f64 && MTy == MVT::i32) ||
849 (LT.second == MVT::f32 && MTy == MVT::i64)))
850 return LT.first;
851 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
852 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
853 MTy.getScalarSizeInBits() == 64)
854 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
855 }
856 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
857 // f32.
858 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
859 return LT.first + getIntrinsicInstrCost(
860 {ICA.getID(),
861 RetTy,
862 {ICA.getArgTypes()[0]->getWithNewType(
863 Type::getFloatTy(RetTy->getContext()))}},
864 CostKind);
865 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
866 (LT.second == MVT::f16 && MTy == MVT::i64) ||
867 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
868 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
869 return LT.first;
870 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
871 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
872 MTy.getScalarSizeInBits() == 32)
873 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
874 // Extending vector types v8f16->v8i32. These currently scalarize but the
875 // codegen could be better.
876 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
877 MTy.getScalarSizeInBits() == 64)
878 return MTy.getVectorNumElements() * 3;
879
880 // If we can we use a legal convert followed by a min+max
881 if ((LT.second.getScalarType() == MVT::f32 ||
882 LT.second.getScalarType() == MVT::f64 ||
883 LT.second.getScalarType() == MVT::f16) &&
884 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
885 Type *LegalTy =
886 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
887 if (LT.second.isVector())
888 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
889 InstructionCost Cost = 1;
890 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
891 LegalTy, {LegalTy, LegalTy});
892 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
893 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
894 LegalTy, {LegalTy, LegalTy});
895 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
896 return LT.first * Cost +
897 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
898 : 1);
899 }
900 // Otherwise we need to follow the default expansion that clamps the value
901 // using a float min/max with a fcmp+sel for nan handling when signed.
902 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
903 RetTy = RetTy->getScalarType();
904 if (LT.second.isVector()) {
905 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
906 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
907 }
908 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
909 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
910 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
911 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
912 Cost +=
913 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
914 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
915 if (IsSigned) {
916 Type *CondTy = RetTy->getWithNewBitWidth(1);
917 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
918 CmpInst::FCMP_UNO, CostKind);
919 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
920 CmpInst::FCMP_UNO, CostKind);
921 }
922 return LT.first * Cost;
923 }
924 case Intrinsic::fshl:
925 case Intrinsic::fshr: {
926 if (ICA.getArgs().empty())
927 break;
928
929 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
930
931 // ROTR / ROTL is a funnel shift with equal first and second operand. For
932 // ROTR on integer registers (i32/i64) this can be done in a single ror
933 // instruction. A fshl with a non-constant shift uses a neg + ror.
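 // e.g. fshl(x, x, 3) on i64 becomes a single ROR (cost 1), while a variable
 // left-rotate amount needs an extra NEG first (cost 2).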
934 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
935 (RetTy->getPrimitiveSizeInBits() == 32 ||
936 RetTy->getPrimitiveSizeInBits() == 64)) {
937 InstructionCost NegCost =
938 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
939 return 1 + NegCost;
940 }
941
942 // TODO: Add handling for fshl where third argument is not a constant.
943 if (!OpInfoZ.isConstant())
944 break;
945
946 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
947 if (OpInfoZ.isUniform()) {
948 static const CostTblEntry FshlTbl[] = {
949 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
950 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
951 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
952 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
953 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
954 // to avoid having to duplicate the costs.
955 const auto *Entry =
956 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
957 if (Entry)
958 return LegalisationCost.first * Entry->Cost;
959 }
960
961 auto TyL = getTypeLegalizationCost(RetTy);
962 if (!RetTy->isIntegerTy())
963 break;
964
965 // Estimate cost manually, as types like i8 and i16 will get promoted to
966 // i32 and CostTableLookup will ignore the extra conversion cost.
967 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
968 RetTy->getScalarSizeInBits() < 64) ||
969 (RetTy->getScalarSizeInBits() % 64 != 0);
970 unsigned ExtraCost = HigherCost ? 1 : 0;
971 if (RetTy->getScalarSizeInBits() == 32 ||
972 RetTy->getScalarSizeInBits() == 64)
973 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
974 // extr instruction.
975 else if (HigherCost)
976 ExtraCost = 1;
977 else
978 break;
979 return TyL.first + ExtraCost;
980 }
981 case Intrinsic::get_active_lane_mask: {
982 auto RetTy = cast<VectorType>(ICA.getReturnType());
983 EVT RetVT = getTLI()->getValueType(DL, RetTy);
984 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
985 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
986 break;
987
988 if (RetTy->isScalableTy()) {
989 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
991 break;
992
993 auto LT = getTypeLegalizationCost(RetTy);
994 InstructionCost Cost = LT.first;
995 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
996 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
997 // nxv32i1 = get_active_lane_mask(base, idx) ->
998 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
999 if (ST->hasSVE2p1() || ST->hasSME2()) {
1000 Cost /= 2;
1001 if (Cost == 1)
1002 return Cost;
1003 }
1004
1005 // If more than one whilelo intrinsic is required, include the extra cost
1006 // required by the saturating add & select required to increment the
1007 // start value after the first intrinsic call.
1008 Type *OpTy = ICA.getArgTypes()[0];
1009 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1010 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1011 Type *CondTy = OpTy->getWithNewBitWidth(1);
1012 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1013 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1014 return Cost + (SplitCost * (Cost - 1));
1015 } else if (!getTLI()->isTypeLegal(RetVT)) {
1016 // We don't have enough context at this point to determine if the mask
1017 // is going to be kept live after the block, which will force the vXi1
1018 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1019 // For now, we just assume the vectorizer created this intrinsic and
1020 // the result will be the input for a PHI. In this case the cost will
1021 // be extremely high for fixed-width vectors.
1022 // NOTE: getScalarizationOverhead returns a cost that's far too
1023 // pessimistic for the actual generated codegen. In reality there are
1024 // two instructions generated per lane.
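 // e.g. a v4i1 result is costed as 8 (4 lanes x 2 instructions).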
1025 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1026 }
1027 break;
1028 }
1029 case Intrinsic::experimental_vector_match: {
1030 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1031 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1032 unsigned SearchSize = NeedleTy->getNumElements();
1033 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1034 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1035 // Neoverse V3, these are cheap operations with the same latency as a
1036 // vector ADD. In most cases, however, we also need to do an extra DUP.
1037 // For fixed-length vectors we currently need an extra five--six
1038 // instructions besides the MATCH.
1040 if (isa<FixedVectorType>(RetTy))
1041 Cost += 10;
1042 return Cost;
1043 }
1044 break;
1045 }
1046 case Intrinsic::experimental_cttz_elts: {
1047 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1048 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1049 // This will consist of a SVE brkb and a cntp instruction. These
1050 // typically have the same latency and half the throughput as a vector
1051 // add instruction.
1052 return 4;
1053 }
1054 break;
1055 }
1056 case Intrinsic::experimental_vector_extract_last_active:
1057 if (ST->isSVEorStreamingSVEAvailable()) {
1058 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1059 // This should turn into chained clastb instructions.
1060 return LegalCost;
1061 }
1062 break;
1063 default:
1064 break;
1065 }
1066 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1067}
1068
1069/// The function will remove redundant reinterpret casts in the presence
1070/// of control flow.
1071static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1072 IntrinsicInst &II) {
1073 SmallVector<Instruction *, 32> Worklist;
1074 auto RequiredType = II.getType();
1075
1076 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1077 assert(PN && "Expected Phi Node!");
1078
1079 // Don't create a new Phi unless we can remove the old one.
1080 if (!PN->hasOneUse())
1081 return std::nullopt;
1082
1083 for (Value *IncValPhi : PN->incoming_values()) {
1084 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1085 if (!Reinterpret ||
1086 Reinterpret->getIntrinsicID() !=
1087 Intrinsic::aarch64_sve_convert_to_svbool ||
1088 RequiredType != Reinterpret->getArgOperand(0)->getType())
1089 return std::nullopt;
1090 }
1091
1092 // Create the new Phi
1093 IC.Builder.SetInsertPoint(PN);
1094 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1095 Worklist.push_back(PN);
1096
1097 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1098 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1099 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1100 Worklist.push_back(Reinterpret);
1101 }
1102
1103 // Cleanup Phi Node and reinterprets
1104 return IC.replaceInstUsesWith(II, NPN);
1105}
1106
1107// A collection of properties common to SVE intrinsics that allow for combines
1108// to be written without needing to know the specific intrinsic.
1109struct SVEIntrinsicInfo {
1110 //
1111 // Helper routines for common intrinsic definitions.
1112 //
1113
1114 // e.g. llvm.aarch64.sve.add pg, op1, op2
1115 // with IID ==> llvm.aarch64.sve.add_u
1116 static SVEIntrinsicInfo
1123
1124 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1131
1132 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1138
1139 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1145
1146 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1147 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1148 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1149 return SVEIntrinsicInfo()
1152 }
1153
1154 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1155 // llvm.aarch64.sve.ld1 pg, ptr
1162
1163 // All properties relate to predication and thus having a general predicate
1164 // is the minimum requirement to say there is intrinsic info to act on.
1165 explicit operator bool() const { return hasGoverningPredicate(); }
1166
1167 //
1168 // Properties relating to the governing predicate.
1169 //
1170
1171 bool hasGoverningPredicate() const {
1172 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1173 }
1174
1176 assert(hasGoverningPredicate() && "Property not set!");
1177 return GoverningPredicateIdx;
1178 }
1179
1181 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1182 GoverningPredicateIdx = Index;
1183 return *this;
1184 }
1185
1186 //
1187 // Properties relating to operations the intrinsic could be transformed into.
1188 // NOTE: This does not mean such a transformation is always possible, but the
1189 // knowledge makes it possible to reuse existing optimisations without needing
1190 // to embed specific handling for each intrinsic. For example, instruction
1191 // simplification can be used to optimise an intrinsic's active lanes.
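 // e.g. llvm.aarch64.sve.mul maps to Instruction::Mul, so instruction
 // simplification can fold a multiply by 1 away for the active lanes.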
1192 //
1193
1195 return UndefIntrinsic != Intrinsic::not_intrinsic;
1196 }
1197
1199 assert(hasMatchingUndefIntrinsic() && "Property not set!");
1200 return UndefIntrinsic;
1201 }
1202
1204 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1205 UndefIntrinsic = IID;
1206 return *this;
1207 }
1208
1209 bool hasMatchingIROpode() const { return IROpcode != 0; }
1210
1211 unsigned getMatchingIROpode() const {
1212 assert(hasMatchingIROpode() && "Property not set!");
1213 return IROpcode;
1214 }
1215
1217 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1218 IROpcode = Opcode;
1219 return *this;
1220 }
1221
1222 //
1223 // Properties relating to the result of inactive lanes.
1224 //
1225
1226 bool inactiveLanesTakenFromOperand() const {
1227 return ResultLanes == InactiveLanesTakenFromOperand;
1228 }
1229
1231 assert(inactiveLanesTakenFromOperand() && "Property not set!");
1232 return OperandIdxForInactiveLanes;
1233 }
1234
1236 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1237 ResultLanes = InactiveLanesTakenFromOperand;
1238 OperandIdxForInactiveLanes = Index;
1239 return *this;
1240 }
1241
1242 bool inactiveLanesAreNotDefined() const {
1243 return ResultLanes == InactiveLanesAreNotDefined;
1244 }
1245
1247 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1248 ResultLanes = InactiveLanesAreNotDefined;
1249 return *this;
1250 }
1251
1252 bool inactiveLanesAreUnused() const {
1253 return ResultLanes == InactiveLanesAreUnused;
1254 }
1255
1257 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1258 ResultLanes = InactiveLanesAreUnused;
1259 return *this;
1260 }
1261
1262 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1263 // inactiveLanesAreZeroed =
1264 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1265 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1266
1268 ResultIsZeroInitialized = true;
1269 return *this;
1270 }
1271
1272 //
1273 // The first operand of unary merging operations is typically only used to
1274 // set the result for inactive lanes. Knowing this allows us to deadcode the
1275 // operand when we can prove there are no inactive lanes.
1276 //
1277
1278 bool hasOperandWithNoActiveLanes() const {
1279 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1280 }
1281
1284 assert(hasOperandWithNoActiveLanes() && "Property not set!");
1284 return OperandIdxWithNoActiveLanes;
1285 }
1286
1288 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1289 OperandIdxWithNoActiveLanes = Index;
1290 return *this;
1291 }
1292
1293private:
1294 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1295
1296 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1297 unsigned IROpcode = 0;
1298
1299 enum PredicationStyle {
1301 InactiveLanesTakenFromOperand,
1302 InactiveLanesAreNotDefined,
1303 InactiveLanesAreUnused
1304 } ResultLanes = Uninitialized;
1305
1306 bool ResultIsZeroInitialized = false;
1307 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1308 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1309};
1310
1311static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1312 // Some SVE intrinsics do not use scalable vector types, but since they are
1313 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1314 if (!isa<ScalableVectorType>(II.getType()) &&
1315 all_of(II.args(), [&](const Value *V) {
1316 return !isa<ScalableVectorType>(V->getType());
1317 }))
1318 return SVEIntrinsicInfo();
1319
1320 Intrinsic::ID IID = II.getIntrinsicID();
1321 switch (IID) {
1322 default:
1323 break;
1324 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1325 case Intrinsic::aarch64_sve_fcvt_f16f32:
1326 case Intrinsic::aarch64_sve_fcvt_f16f64:
1327 case Intrinsic::aarch64_sve_fcvt_f32f16:
1328 case Intrinsic::aarch64_sve_fcvt_f32f64:
1329 case Intrinsic::aarch64_sve_fcvt_f64f16:
1330 case Intrinsic::aarch64_sve_fcvt_f64f32:
1331 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1332 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1333 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1334 case Intrinsic::aarch64_sve_fcvtzs:
1335 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1336 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1337 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1338 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1339 case Intrinsic::aarch64_sve_fcvtzu:
1340 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1341 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1342 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1343 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1344 case Intrinsic::aarch64_sve_scvtf:
1345 case Intrinsic::aarch64_sve_scvtf_f16i32:
1346 case Intrinsic::aarch64_sve_scvtf_f16i64:
1347 case Intrinsic::aarch64_sve_scvtf_f32i64:
1348 case Intrinsic::aarch64_sve_scvtf_f64i32:
1349 case Intrinsic::aarch64_sve_ucvtf:
1350 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1351 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1352 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1353 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1355
1356 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1357 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1358 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1359 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1361
1362 case Intrinsic::aarch64_sve_fabd:
1363 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1364 case Intrinsic::aarch64_sve_fadd:
1365 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1366 .setMatchingIROpcode(Instruction::FAdd);
1367 case Intrinsic::aarch64_sve_fdiv:
1368 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1369 .setMatchingIROpcode(Instruction::FDiv);
1370 case Intrinsic::aarch64_sve_fmax:
1371 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1372 case Intrinsic::aarch64_sve_fmaxnm:
1373 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1374 case Intrinsic::aarch64_sve_fmin:
1375 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1376 case Intrinsic::aarch64_sve_fminnm:
1377 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1378 case Intrinsic::aarch64_sve_fmla:
1379 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1380 case Intrinsic::aarch64_sve_fmls:
1381 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1382 case Intrinsic::aarch64_sve_fmul:
1383 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1384 .setMatchingIROpcode(Instruction::FMul);
1385 case Intrinsic::aarch64_sve_fmulx:
1386 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1387 case Intrinsic::aarch64_sve_fnmla:
1388 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1389 case Intrinsic::aarch64_sve_fnmls:
1390 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1391 case Intrinsic::aarch64_sve_fsub:
1392 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1393 .setMatchingIROpcode(Instruction::FSub);
1394 case Intrinsic::aarch64_sve_add:
1395 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1396 .setMatchingIROpcode(Instruction::Add);
1397 case Intrinsic::aarch64_sve_mla:
1398 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1399 case Intrinsic::aarch64_sve_mls:
1400 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1401 case Intrinsic::aarch64_sve_mul:
1402 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1403 .setMatchingIROpcode(Instruction::Mul);
1404 case Intrinsic::aarch64_sve_sabd:
1405 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1406 case Intrinsic::aarch64_sve_sdiv:
1407 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1408 .setMatchingIROpcode(Instruction::SDiv);
1409 case Intrinsic::aarch64_sve_smax:
1410 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1411 case Intrinsic::aarch64_sve_smin:
1412 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1413 case Intrinsic::aarch64_sve_smulh:
1414 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1415 case Intrinsic::aarch64_sve_sub:
1416 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1417 .setMatchingIROpcode(Instruction::Sub);
1418 case Intrinsic::aarch64_sve_uabd:
1419 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1420 case Intrinsic::aarch64_sve_udiv:
1421 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1422 .setMatchingIROpcode(Instruction::UDiv);
1423 case Intrinsic::aarch64_sve_umax:
1424 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1425 case Intrinsic::aarch64_sve_umin:
1426 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1427 case Intrinsic::aarch64_sve_umulh:
1428 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1429 case Intrinsic::aarch64_sve_asr:
1430 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1431 .setMatchingIROpcode(Instruction::AShr);
1432 case Intrinsic::aarch64_sve_lsl:
1433 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1434 .setMatchingIROpcode(Instruction::Shl);
1435 case Intrinsic::aarch64_sve_lsr:
1436 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1437 .setMatchingIROpcode(Instruction::LShr);
1438 case Intrinsic::aarch64_sve_and:
1439 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1440 .setMatchingIROpcode(Instruction::And);
1441 case Intrinsic::aarch64_sve_bic:
1442 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1443 case Intrinsic::aarch64_sve_eor:
1444 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1445 .setMatchingIROpcode(Instruction::Xor);
1446 case Intrinsic::aarch64_sve_orr:
1447 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1448 .setMatchingIROpcode(Instruction::Or);
1449 case Intrinsic::aarch64_sve_sqsub:
1450 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1451 case Intrinsic::aarch64_sve_uqsub:
1452 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1453
1454 case Intrinsic::aarch64_sve_add_u:
1456 Instruction::Add);
1457 case Intrinsic::aarch64_sve_and_u:
1459 Instruction::And);
1460 case Intrinsic::aarch64_sve_asr_u:
1462 Instruction::AShr);
1463 case Intrinsic::aarch64_sve_eor_u:
1465 Instruction::Xor);
1466 case Intrinsic::aarch64_sve_fadd_u:
1468 Instruction::FAdd);
1469 case Intrinsic::aarch64_sve_fdiv_u:
1471 Instruction::FDiv);
1472 case Intrinsic::aarch64_sve_fmul_u:
1474 Instruction::FMul);
1475 case Intrinsic::aarch64_sve_fsub_u:
1477 Instruction::FSub);
1478 case Intrinsic::aarch64_sve_lsl_u:
1480 Instruction::Shl);
1481 case Intrinsic::aarch64_sve_lsr_u:
1483 Instruction::LShr);
1484 case Intrinsic::aarch64_sve_mul_u:
1486 Instruction::Mul);
1487 case Intrinsic::aarch64_sve_orr_u:
1489 Instruction::Or);
1490 case Intrinsic::aarch64_sve_sdiv_u:
1492 Instruction::SDiv);
1493 case Intrinsic::aarch64_sve_sub_u:
1495 Instruction::Sub);
1496 case Intrinsic::aarch64_sve_udiv_u:
1498 Instruction::UDiv);
1499
1500 case Intrinsic::aarch64_sve_addqv:
1501 case Intrinsic::aarch64_sve_and_z:
1502 case Intrinsic::aarch64_sve_bic_z:
1503 case Intrinsic::aarch64_sve_brka_z:
1504 case Intrinsic::aarch64_sve_brkb_z:
1505 case Intrinsic::aarch64_sve_brkn_z:
1506 case Intrinsic::aarch64_sve_brkpa_z:
1507 case Intrinsic::aarch64_sve_brkpb_z:
1508 case Intrinsic::aarch64_sve_cntp:
1509 case Intrinsic::aarch64_sve_compact:
1510 case Intrinsic::aarch64_sve_eor_z:
1511 case Intrinsic::aarch64_sve_eorv:
1512 case Intrinsic::aarch64_sve_eorqv:
1513 case Intrinsic::aarch64_sve_nand_z:
1514 case Intrinsic::aarch64_sve_nor_z:
1515 case Intrinsic::aarch64_sve_orn_z:
1516 case Intrinsic::aarch64_sve_orr_z:
1517 case Intrinsic::aarch64_sve_orv:
1518 case Intrinsic::aarch64_sve_orqv:
1519 case Intrinsic::aarch64_sve_pnext:
1520 case Intrinsic::aarch64_sve_rdffr_z:
1521 case Intrinsic::aarch64_sve_saddv:
1522 case Intrinsic::aarch64_sve_uaddv:
1523 case Intrinsic::aarch64_sve_umaxv:
1524 case Intrinsic::aarch64_sve_umaxqv:
1525 case Intrinsic::aarch64_sve_cmpeq:
1526 case Intrinsic::aarch64_sve_cmpeq_wide:
1527 case Intrinsic::aarch64_sve_cmpge:
1528 case Intrinsic::aarch64_sve_cmpge_wide:
1529 case Intrinsic::aarch64_sve_cmpgt:
1530 case Intrinsic::aarch64_sve_cmpgt_wide:
1531 case Intrinsic::aarch64_sve_cmphi:
1532 case Intrinsic::aarch64_sve_cmphi_wide:
1533 case Intrinsic::aarch64_sve_cmphs:
1534 case Intrinsic::aarch64_sve_cmphs_wide:
1535 case Intrinsic::aarch64_sve_cmple_wide:
1536 case Intrinsic::aarch64_sve_cmplo_wide:
1537 case Intrinsic::aarch64_sve_cmpls_wide:
1538 case Intrinsic::aarch64_sve_cmplt_wide:
1539 case Intrinsic::aarch64_sve_cmpne:
1540 case Intrinsic::aarch64_sve_cmpne_wide:
1541 case Intrinsic::aarch64_sve_facge:
1542 case Intrinsic::aarch64_sve_facgt:
1543 case Intrinsic::aarch64_sve_fcmpeq:
1544 case Intrinsic::aarch64_sve_fcmpge:
1545 case Intrinsic::aarch64_sve_fcmpgt:
1546 case Intrinsic::aarch64_sve_fcmpne:
1547 case Intrinsic::aarch64_sve_fcmpuo:
1548 case Intrinsic::aarch64_sve_ld1:
1549 case Intrinsic::aarch64_sve_ld1_gather:
1550 case Intrinsic::aarch64_sve_ld1_gather_index:
1551 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1552 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1553 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1554 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1555 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1556 case Intrinsic::aarch64_sve_ld1q_gather_index:
1557 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1558 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1559 case Intrinsic::aarch64_sve_ld1ro:
1560 case Intrinsic::aarch64_sve_ld1rq:
1561 case Intrinsic::aarch64_sve_ld1udq:
1562 case Intrinsic::aarch64_sve_ld1uwq:
1563 case Intrinsic::aarch64_sve_ld2_sret:
1564 case Intrinsic::aarch64_sve_ld2q_sret:
1565 case Intrinsic::aarch64_sve_ld3_sret:
1566 case Intrinsic::aarch64_sve_ld3q_sret:
1567 case Intrinsic::aarch64_sve_ld4_sret:
1568 case Intrinsic::aarch64_sve_ld4q_sret:
1569 case Intrinsic::aarch64_sve_ldff1:
1570 case Intrinsic::aarch64_sve_ldff1_gather:
1571 case Intrinsic::aarch64_sve_ldff1_gather_index:
1572 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1573 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1574 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1575 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1576 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1577 case Intrinsic::aarch64_sve_ldnf1:
1578 case Intrinsic::aarch64_sve_ldnt1:
1579 case Intrinsic::aarch64_sve_ldnt1_gather:
1580 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1581 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1582 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1584
1585 case Intrinsic::aarch64_sve_prf:
1586 case Intrinsic::aarch64_sve_prfb_gather_index:
1587 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1588 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1589 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1590 case Intrinsic::aarch64_sve_prfd_gather_index:
1591 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1592 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1593 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1594 case Intrinsic::aarch64_sve_prfh_gather_index:
1595 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1596 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1597 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1598 case Intrinsic::aarch64_sve_prfw_gather_index:
1599 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1600 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1601 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1603
1604 case Intrinsic::aarch64_sve_st1_scatter:
1605 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1606 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1607 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1608 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1609 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1610 case Intrinsic::aarch64_sve_st1dq:
1611 case Intrinsic::aarch64_sve_st1q_scatter_index:
1612 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1613 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1614 case Intrinsic::aarch64_sve_st1wq:
1615 case Intrinsic::aarch64_sve_stnt1:
1616 case Intrinsic::aarch64_sve_stnt1_scatter:
1617 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1618 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1619 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1621 case Intrinsic::aarch64_sve_st2:
1622 case Intrinsic::aarch64_sve_st2q:
1624 case Intrinsic::aarch64_sve_st3:
1625 case Intrinsic::aarch64_sve_st3q:
1627 case Intrinsic::aarch64_sve_st4:
1628 case Intrinsic::aarch64_sve_st4q:
1630 }
1631
1632 return SVEIntrinsicInfo();
1633}
1634
1635static bool isAllActivePredicate(Value *Pred) {
1636 Value *UncastedPred;
1637
1638 // Look through predicate casts that only remove lanes.
1639 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1640 m_Value(UncastedPred)))) {
1641 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1642 Pred = UncastedPred;
1643
1644 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1645 m_Value(UncastedPred))))
1646 // If the predicate has no more lanes than the uncasted predicate, then we
1647 // know the casting has no effect.
1648 if (OrigPredTy->getMinNumElements() <=
1649 cast<ScalableVectorType>(UncastedPred->getType())
1650 ->getMinNumElements())
1651 Pred = UncastedPred;
1652 }
1653
1654 auto *C = dyn_cast<Constant>(Pred);
1655 return C && C->isAllOnesValue();
1656}
1657
1658// Simplify `V` by only considering the operations that affect active lanes.
1659// This function should only return existing Values or newly created Constants.
1660static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1661 auto *Dup = dyn_cast<IntrinsicInst>(V);
1662 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1663 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1664 return ConstantVector::getSplat(
1665 cast<VectorType>(V->getType())->getElementCount(),
1666 cast<Constant>(Dup->getOperand(2)));
1667
1668 return V;
1669}
1670
1671static std::optional<Instruction *>
1672 simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1673 const SVEIntrinsicInfo &IInfo) {
1674 const unsigned Opc = IInfo.getMatchingIROpode();
1675 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1676
1677 Value *Pg = II.getOperand(0);
1678 Value *Op1 = II.getOperand(1);
1679 Value *Op2 = II.getOperand(2);
1680 const DataLayout &DL = II.getDataLayout();
1681
1682 // Canonicalise constants to the RHS.
1683 if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() &&
1684 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1685 IC.replaceOperand(II, 1, Op2);
1686 IC.replaceOperand(II, 2, Op1);
1687 return &II;
1688 }
1689
1690 // Only active lanes matter when simplifying the operation.
1691 Op1 = stripInactiveLanes(Op1, Pg);
1692 Op2 = stripInactiveLanes(Op2, Pg);
1693
1694 Value *SimpleII;
1695 if (auto FII = dyn_cast<FPMathOperator>(&II))
1696 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1697 else
1698 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1699
1700 // An SVE intrinsic's result is always defined. However, this is not the case
1701 // for its equivalent IR instruction (e.g. when shifting by an amount more
1702 // than the data's bitwidth). Simplifications to an undefined result must be
1703 // ignored to preserve the intrinsic's expected behaviour.
1704 if (!SimpleII || isa<UndefValue>(SimpleII))
1705 return std::nullopt;
1706
1707 if (IInfo.inactiveLanesAreNotDefined())
1708 return IC.replaceInstUsesWith(II, SimpleII);
1709
1710 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1711
1712 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1713 if (SimpleII == Inactive)
1714 return IC.replaceInstUsesWith(II, SimpleII);
1715
1716 // Inactive lanes must be preserved.
1717 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1718 return IC.replaceInstUsesWith(II, SimpleII);
1719}
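// Illustrative sketch (annotation, not part of the upstream source): a call
// such as
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x i32> %x,
//            <vscale x 4 x i32> splat (i32 1))
// simplifies to %x. Because the simplified value equals the operand that
// supplies the inactive lanes, no select is needed; otherwise the result is
// wrapped in a select between the simplified value and the inactive operand.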
1720
1721// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1722// to operations with less strict inactive lane requirements.
1723static std::optional<Instruction *>
1724 simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1725 const SVEIntrinsicInfo &IInfo) {
1726 if (!IInfo.hasGoverningPredicate())
1727 return std::nullopt;
1728
1729 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1730
1731 // If there are no active lanes.
1732 if (match(OpPredicate, m_ZeroInt())) {
1733 if (IInfo.inactiveLanesTakenFromOperand())
1734 return IC.replaceInstUsesWith(
1735 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1736
1737 if (IInfo.inactiveLanesAreUnused()) {
1738 if (IInfo.resultIsZeroInitialized())
1739 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1740
1741 return IC.eraseInstFromFunction(II);
1742 }
1743 }
1744
1745 // If there are no inactive lanes.
1746 if (isAllActivePredicate(OpPredicate)) {
1747 if (IInfo.hasOperandWithNoActiveLanes()) {
1748 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1749 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1750 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1751 }
1752
1753 if (IInfo.hasMatchingUndefIntrinsic()) {
1754 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1755 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1756 II.setCalledFunction(NewDecl);
1757 return &II;
1758 }
1759 }
1760
1761 // Operation specific simplifications.
1762 if (IInfo.hasMatchingIROpode() &&
1763 Instruction::isBinaryOp(IInfo.getMatchingIROpode()))
1764 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1765
1766 return std::nullopt;
1767}
1768
1769// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1770// => (binop (pred) (from_svbool _) (from_svbool _))
1771//
1772// The above transformation eliminates a `to_svbool` in the predicate
1773// operand of bitwise operation `binop` by narrowing the vector width of
1774// the operation. For example, it would convert a `<vscale x 16 x i1>
1775// and` into a `<vscale x 4 x i1> and`. This is profitable because
1776// to_svbool must zero the new lanes during widening, whereas
1777// from_svbool is free.
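//
// Illustrative sketch (annotation, not part of the upstream source):
//   %p16 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(
//              <vscale x 4 x i1> %pred)
//   %b16 = call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(
//              <vscale x 16 x i1> %p16, <vscale x 16 x i1> %a,
//              <vscale x 16 x i1> %b)
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(
//              <vscale x 16 x i1> %b16)
// becomes an and.z over <vscale x 4 x i1> values, with %a and %b narrowed via
// convert.from.svbool and %pred used directly as the governing predicate.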
1778 static std::optional<Instruction *>
1779 tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1780 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
1781 if (!BinOp)
1782 return std::nullopt;
1783
1784 auto IntrinsicID = BinOp->getIntrinsicID();
1785 switch (IntrinsicID) {
1786 case Intrinsic::aarch64_sve_and_z:
1787 case Intrinsic::aarch64_sve_bic_z:
1788 case Intrinsic::aarch64_sve_eor_z:
1789 case Intrinsic::aarch64_sve_nand_z:
1790 case Intrinsic::aarch64_sve_nor_z:
1791 case Intrinsic::aarch64_sve_orn_z:
1792 case Intrinsic::aarch64_sve_orr_z:
1793 break;
1794 default:
1795 return std::nullopt;
1796 }
1797
1798 auto BinOpPred = BinOp->getOperand(0);
1799 auto BinOpOp1 = BinOp->getOperand(1);
1800 auto BinOpOp2 = BinOp->getOperand(2);
1801
1802 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1803 if (!PredIntr ||
1804 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1805 return std::nullopt;
1806
1807 auto PredOp = PredIntr->getOperand(0);
1808 auto PredOpTy = cast<VectorType>(PredOp->getType());
1809 if (PredOpTy != II.getType())
1810 return std::nullopt;
1811
1812 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1813 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1814 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1815 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1816 if (BinOpOp1 == BinOpOp2)
1817 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1818 else
1819 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1820 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1821
1822 auto NarrowedBinOp =
1823 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1824 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1825}
1826
1827 static std::optional<Instruction *>
1828 instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1829 // If the reinterpret instruction operand is a PHI Node
1830 if (isa<PHINode>(II.getArgOperand(0)))
1831 return processPhiNode(IC, II);
1832
1833 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1834 return BinOpCombine;
1835
1836 // Ignore converts to/from svcount_t.
1837 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1838 isa<TargetExtType>(II.getType()))
1839 return std::nullopt;
1840
1841 SmallVector<Instruction *, 32> CandidatesForRemoval;
1842 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1843
1844 const auto *IVTy = cast<VectorType>(II.getType());
1845
1846 // Walk the chain of conversions.
1847 while (Cursor) {
1848 // If the type of the cursor has fewer lanes than the final result, zeroing
1849 // must take place, which breaks the equivalence chain.
1850 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1851 if (CursorVTy->getElementCount().getKnownMinValue() <
1852 IVTy->getElementCount().getKnownMinValue())
1853 break;
1854
1855 // If the cursor has the same type as I, it is a viable replacement.
1856 if (Cursor->getType() == IVTy)
1857 EarliestReplacement = Cursor;
1858
1859 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1860
1861 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1862 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1863 Intrinsic::aarch64_sve_convert_to_svbool ||
1864 IntrinsicCursor->getIntrinsicID() ==
1865 Intrinsic::aarch64_sve_convert_from_svbool))
1866 break;
1867
1868 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1869 Cursor = IntrinsicCursor->getOperand(0);
1870 }
1871
1872 // If no viable replacement in the conversion chain was found, there is
1873 // nothing to do.
1874 if (!EarliestReplacement)
1875 return std::nullopt;
1876
1877 return IC.replaceInstUsesWith(II, EarliestReplacement);
1878}
1879
1880static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1881 IntrinsicInst &II) {
1882 // svsel(ptrue, x, y) => x
1883 auto *OpPredicate = II.getOperand(0);
1884 if (isAllActivePredicate(OpPredicate))
1885 return IC.replaceInstUsesWith(II, II.getOperand(1));
1886
1887 auto Select =
1888 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1889 return IC.replaceInstUsesWith(II, Select);
1890}
1891
1892static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1893 IntrinsicInst &II) {
1894 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1895 if (!Pg)
1896 return std::nullopt;
1897
1898 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1899 return std::nullopt;
1900
1901 const auto PTruePattern =
1902 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1903 if (PTruePattern != AArch64SVEPredPattern::vl1)
1904 return std::nullopt;
1905
1906 // The intrinsic is inserting into lane zero so use an insert instead.
1907 auto *IdxTy = Type::getInt64Ty(II.getContext());
1908 auto *Insert = InsertElementInst::Create(
1909 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1910 Insert->insertBefore(II.getIterator());
1911 Insert->takeName(&II);
1912
1913 return IC.replaceInstUsesWith(II, Insert);
1914}
1915
1916static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1917 IntrinsicInst &II) {
1918 // Replace DupX with a regular IR splat.
1919 auto *RetTy = cast<ScalableVectorType>(II.getType());
1920 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1921 II.getArgOperand(0));
1922 Splat->takeName(&II);
1923 return IC.replaceInstUsesWith(II, Splat);
1924}
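// Illustrative sketch (annotation, not part of the upstream source):
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %s)
// becomes a plain IR splat of %s (an insertelement into poison followed by a
// zero-mask shufflevector), which target-independent folds understand better
// than the target intrinsic.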
1925
1926static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1927 IntrinsicInst &II) {
1928 LLVMContext &Ctx = II.getContext();
1929
1930 if (!isAllActivePredicate(II.getArgOperand(0)))
1931 return std::nullopt;
1932
1933 // Check that we have a compare of zero..
1934 auto *SplatValue =
1935 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1936 if (!SplatValue || !SplatValue->isZero())
1937 return std::nullopt;
1938
1939 // ..against a dupq
1940 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1941 if (!DupQLane ||
1942 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1943 return std::nullopt;
1944
1945 // Where the dupq is a lane 0 replicate of a vector insert
1946 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1947 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1948 return std::nullopt;
1949
1950 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1951 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1952 return std::nullopt;
1953
1954 // Where the vector insert is a fixed constant vector insert into undef at
1955 // index zero
1956 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1957 return std::nullopt;
1958
1959 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1960 return std::nullopt;
1961
1962 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1963 if (!ConstVec)
1964 return std::nullopt;
1965
1966 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1967 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1968 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1969 return std::nullopt;
1970
1971 unsigned NumElts = VecTy->getNumElements();
1972 unsigned PredicateBits = 0;
1973
1974 // Expand intrinsic operands to a 16-bit byte level predicate
1975 for (unsigned I = 0; I < NumElts; ++I) {
1976 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1977 if (!Arg)
1978 return std::nullopt;
1979 if (!Arg->isZero())
1980 PredicateBits |= 1 << (I * (16 / NumElts));
1981 }
1982
1983 // If all bits are zero bail early with an empty predicate
1984 if (PredicateBits == 0) {
1985 auto *PFalse = Constant::getNullValue(II.getType());
1986 PFalse->takeName(&II);
1987 return IC.replaceInstUsesWith(II, PFalse);
1988 }
1989
1990 // Calculate largest predicate type used (where byte predicate is largest)
1991 unsigned Mask = 8;
1992 for (unsigned I = 0; I < 16; ++I)
1993 if ((PredicateBits & (1 << I)) != 0)
1994 Mask |= (I % 8);
1995
1996 unsigned PredSize = Mask & -Mask;
1997 auto *PredType = ScalableVectorType::get(
1998 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1999
2000 // Ensure all relevant bits are set
2001 for (unsigned I = 0; I < 16; I += PredSize)
2002 if ((PredicateBits & (1 << I)) == 0)
2003 return std::nullopt;
2004
2005 auto *PTruePat =
2006 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2007 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2008 {PredType}, {PTruePat});
2009 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2010 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
2011 auto *ConvertFromSVBool =
2012 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2013 {II.getType()}, {ConvertToSVBool});
2014
2015 ConvertFromSVBool->takeName(&II);
2016 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2017}
2018
2019static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2020 IntrinsicInst &II) {
2021 Value *Pg = II.getArgOperand(0);
2022 Value *Vec = II.getArgOperand(1);
2023 auto IntrinsicID = II.getIntrinsicID();
2024 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2025
2026 // lastX(splat(X)) --> X
2027 if (auto *SplatVal = getSplatValue(Vec))
2028 return IC.replaceInstUsesWith(II, SplatVal);
2029
2030 // If x and/or y is a splat value then:
2031 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2032 Value *LHS, *RHS;
2033 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2034 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2035 auto *OldBinOp = cast<BinaryOperator>(Vec);
2036 auto OpC = OldBinOp->getOpcode();
2037 auto *NewLHS =
2038 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2039 auto *NewRHS =
2040 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2041 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2042 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2043 return IC.replaceInstUsesWith(II, NewBinOp);
2044 }
2045 }
2046
2047 auto *C = dyn_cast<Constant>(Pg);
2048 if (IsAfter && C && C->isNullValue()) {
2049 // The intrinsic is extracting lane 0 so use an extract instead.
2050 auto *IdxTy = Type::getInt64Ty(II.getContext());
2051 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2052 Extract->insertBefore(II.getIterator());
2053 Extract->takeName(&II);
2054 return IC.replaceInstUsesWith(II, Extract);
2055 }
2056
2057 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2058 if (!IntrPG)
2059 return std::nullopt;
2060
2061 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2062 return std::nullopt;
2063
2064 const auto PTruePattern =
2065 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2066
2067 // Can the intrinsic's predicate be converted to a known constant index?
2068 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2069 if (!MinNumElts)
2070 return std::nullopt;
2071
2072 unsigned Idx = MinNumElts - 1;
2073 // Increment the index if extracting the element after the last active
2074 // predicate element.
2075 if (IsAfter)
2076 ++Idx;
2077
2078 // Ignore extracts whose index is larger than the known minimum vector
2079 // length. NOTE: This is an artificial constraint where we prefer to
2080 // maintain what the user asked for until an alternative is proven faster.
2081 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2082 if (Idx >= PgVTy->getMinNumElements())
2083 return std::nullopt;
2084
2085 // The intrinsic is extracting a fixed lane so use an extract instead.
2086 auto *IdxTy = Type::getInt64Ty(II.getContext());
2087 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2088 Extract->insertBefore(II.getIterator());
2089 Extract->takeName(&II);
2090 return IC.replaceInstUsesWith(II, Extract);
2091}
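// Illustrative sketch (annotation, not part of the upstream source): with a
// ptrue of pattern vl2 governing a <vscale x 4 x i32> vector, the last active
// lane is lane 1, so
//   lastb(ptrue vl2, %v)  ->  extractelement %v, i64 1
//   lasta(ptrue vl2, %v)  ->  extractelement %v, i64 2
// provided the index stays below the type's known minimum element count.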
2092
2093static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2094 IntrinsicInst &II) {
2095 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2096 // integer variant across a variety of micro-architectures. Replace scalar
2097 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2098 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2099 // depending on the micro-architecture, but has been observed as generally
2100 // being faster, particularly when the CLAST[AB] op is a loop-carried
2101 // dependency.
2102 Value *Pg = II.getArgOperand(0);
2103 Value *Fallback = II.getArgOperand(1);
2104 Value *Vec = II.getArgOperand(2);
2105 Type *Ty = II.getType();
2106
2107 if (!Ty->isIntegerTy())
2108 return std::nullopt;
2109
2110 Type *FPTy;
2111 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2112 default:
2113 return std::nullopt;
2114 case 16:
2115 FPTy = IC.Builder.getHalfTy();
2116 break;
2117 case 32:
2118 FPTy = IC.Builder.getFloatTy();
2119 break;
2120 case 64:
2121 FPTy = IC.Builder.getDoubleTy();
2122 break;
2123 }
2124
2125 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2126 auto *FPVTy = VectorType::get(
2127 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2128 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2129 auto *FPII = IC.Builder.CreateIntrinsic(
2130 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2131 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2132 return IC.replaceInstUsesWith(II, FPIItoInt);
2133}
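// Illustrative sketch (annotation, not part of the upstream source): a scalar
// integer clastb such as
//   %r = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> %pg,
//            i32 %fallback, <vscale x 4 x i32> %vec)
// is rewritten to bitcast %fallback and %vec to float form, run the same
// intrinsic on <vscale x 4 x float>, and bitcast the result back to i32, so
// the value stays in a SIMD&FP register.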
2134
2135static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 LLVMContext &Ctx = II.getContext();
2138 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2139 // can work with RDFFR_PP for ptest elimination.
2140 auto *AllPat =
2141 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
2142 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
2143 {II.getType()}, {AllPat});
2144 auto *RDFFR =
2145 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue});
2146 RDFFR->takeName(&II);
2147 return IC.replaceInstUsesWith(II, RDFFR);
2148}
2149
2150 static std::optional<Instruction *>
2151 instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2152 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2153
2154 if (Pattern == AArch64SVEPredPattern::all) {
2155 Value *Cnt = IC.Builder.CreateElementCount(
2156 II.getType(), ElementCount::getScalable(NumElts));
2157 Cnt->takeName(&II);
2158 return IC.replaceInstUsesWith(II, Cnt);
2159 }
2160
2161 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2162
2163 return MinNumElts && NumElts >= MinNumElts
2164 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2165 II, ConstantInt::get(II.getType(), MinNumElts)))
2166 : std::nullopt;
2167}
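// Illustrative sketch (annotation, not part of the upstream source):
//   %n = call i64 @llvm.aarch64.sve.cntw(i32 31)   ; pattern "all"
// becomes an expression equivalent to vscale * 4, while a fixed pattern such
// as vl4 folds to the constant 4, because cntw counts at least four 32-bit
// lanes for any legal SVE vector length.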
2168
2169 static std::optional<Instruction *>
2170 instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2171 const AArch64Subtarget *ST) {
2172 if (!ST->isStreaming())
2173 return std::nullopt;
2174
2175 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2176 // with SVEPredPattern::all
2177 Value *Cnt =
2178 IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2));
2179 Cnt->takeName(&II);
2180 return IC.replaceInstUsesWith(II, Cnt);
2181}
2182
2183static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2184 IntrinsicInst &II) {
2185 Value *PgVal = II.getArgOperand(0);
2186 Value *OpVal = II.getArgOperand(1);
2187
2188 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2189 // Later optimizations prefer this form.
2190 if (PgVal == OpVal &&
2191 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2192 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2193 Value *Ops[] = {PgVal, OpVal};
2194 Type *Tys[] = {PgVal->getType()};
2195
2196 auto *PTest =
2197 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2198 PTest->takeName(&II);
2199
2200 return IC.replaceInstUsesWith(II, PTest);
2201 }
2202
2203 auto *Pg = dyn_cast<IntrinsicInst>(PgVal);
2204 auto *Op = dyn_cast<IntrinsicInst>(OpVal);
2205
2206 if (!Pg || !Op)
2207 return std::nullopt;
2208
2209 Intrinsic::ID OpIID = Op->getIntrinsicID();
2210
2211 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2212 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2213 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2214 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2215 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2216
2217 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2218
2219 PTest->takeName(&II);
2220 return IC.replaceInstUsesWith(II, PTest);
2221 }
2222
2223 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2224 // Later optimizations may rewrite sequence to use the flag-setting variant
2225 // of instruction X to remove PTEST.
2226 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2227 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2228 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2229 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2230 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2231 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2232 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2233 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2234 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2235 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2236 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2237 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2238 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2239 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2240 Type *Tys[] = {Pg->getType()};
2241
2242 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2243 PTest->takeName(&II);
2244
2245 return IC.replaceInstUsesWith(II, PTest);
2246 }
2247
2248 return std::nullopt;
2249}
2250
2251template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2252static std::optional<Instruction *>
2253 instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2254 bool MergeIntoAddendOp) {
2255 Value *P = II.getOperand(0);
2256 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2257 if (MergeIntoAddendOp) {
2258 AddendOp = II.getOperand(1);
2259 Mul = II.getOperand(2);
2260 } else {
2261 AddendOp = II.getOperand(2);
2262 Mul = II.getOperand(1);
2263 }
2264
2265 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
2266 m_Value(MulOp1))))
2267 return std::nullopt;
2268
2269 if (!Mul->hasOneUse())
2270 return std::nullopt;
2271
2272 Instruction *FMFSource = nullptr;
2273 if (II.getType()->isFPOrFPVectorTy()) {
2274 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2275 // Stop the combine when the flags on the inputs differ in case dropping
2276 // flags would lead to us missing out on more beneficial optimizations.
2277 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2278 return std::nullopt;
2279 if (!FAddFlags.allowContract())
2280 return std::nullopt;
2281 FMFSource = &II;
2282 }
2283
2284 CallInst *Res;
2285 if (MergeIntoAddendOp)
2286 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2287 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2288 else
2289 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2290 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2291
2292 return IC.replaceInstUsesWith(II, Res);
2293}
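// Illustrative sketch (annotation, not part of the upstream source): with
// matching fast-math flags that allow contraction,
//   %m = call ... @llvm.aarch64.sve.fmul.nxv4f32(%pg, %b, %c)
//   %r = call ... @llvm.aarch64.sve.fadd.nxv4f32(%pg, %a, %m)
// is fused into a single @llvm.aarch64.sve.fmla.nxv4f32(%pg, %a, %b, %c)
// call, provided %m uses the same predicate and has no other users.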
2294
2295static std::optional<Instruction *>
2296 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2297 Value *Pred = II.getOperand(0);
2298 Value *PtrOp = II.getOperand(1);
2299 Type *VecTy = II.getType();
2300
2301 if (isAllActivePredicate(Pred)) {
2302 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2303 Load->copyMetadata(II);
2304 return IC.replaceInstUsesWith(II, Load);
2305 }
2306
2307 CallInst *MaskedLoad =
2308 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2309 Pred, ConstantAggregateZero::get(VecTy));
2310 MaskedLoad->copyMetadata(II);
2311 return IC.replaceInstUsesWith(II, MaskedLoad);
2312}
2313
2314static std::optional<Instruction *>
2315 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2316 Value *VecOp = II.getOperand(0);
2317 Value *Pred = II.getOperand(1);
2318 Value *PtrOp = II.getOperand(2);
2319
2320 if (isAllActivePredicate(Pred)) {
2321 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2322 Store->copyMetadata(II);
2323 return IC.eraseInstFromFunction(II);
2324 }
2325
2326 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2327 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2328 MaskedStore->copyMetadata(II);
2329 return IC.eraseInstFromFunction(II);
2330}
2331
2332 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2333 switch (Intrinsic) {
2334 case Intrinsic::aarch64_sve_fmul_u:
2335 return Instruction::BinaryOps::FMul;
2336 case Intrinsic::aarch64_sve_fadd_u:
2337 return Instruction::BinaryOps::FAdd;
2338 case Intrinsic::aarch64_sve_fsub_u:
2339 return Instruction::BinaryOps::FSub;
2340 default:
2341 return Instruction::BinaryOpsEnd;
2342 }
2343}
2344
2345static std::optional<Instruction *>
2346 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2347 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2348 if (II.isStrictFP())
2349 return std::nullopt;
2350
2351 auto *OpPredicate = II.getOperand(0);
2352 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2353 if (BinOpCode == Instruction::BinaryOpsEnd ||
2354 !isAllActivePredicate(OpPredicate))
2355 return std::nullopt;
2356 auto BinOp = IC.Builder.CreateBinOpFMF(
2357 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2358 return IC.replaceInstUsesWith(II, BinOp);
2359}
2360
2361static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2362 IntrinsicInst &II) {
2363 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2364 Intrinsic::aarch64_sve_mla>(
2365 IC, II, true))
2366 return MLA;
2367 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2368 Intrinsic::aarch64_sve_mad>(
2369 IC, II, false))
2370 return MAD;
2371 return std::nullopt;
2372}
2373
2374static std::optional<Instruction *>
2375 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2376 if (auto FMLA =
2377 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2378 Intrinsic::aarch64_sve_fmla>(IC, II,
2379 true))
2380 return FMLA;
2381 if (auto FMAD =
2382 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2383 Intrinsic::aarch64_sve_fmad>(IC, II,
2384 false))
2385 return FMAD;
2386 if (auto FMLA =
2387 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2388 Intrinsic::aarch64_sve_fmla>(IC, II,
2389 true))
2390 return FMLA;
2391 return std::nullopt;
2392}
2393
2394static std::optional<Instruction *>
2395 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2396 if (auto FMLA =
2397 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2398 Intrinsic::aarch64_sve_fmla>(IC, II,
2399 true))
2400 return FMLA;
2401 if (auto FMAD =
2402 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2403 Intrinsic::aarch64_sve_fmad>(IC, II,
2404 false))
2405 return FMAD;
2406 if (auto FMLA_U =
2407 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2408 Intrinsic::aarch64_sve_fmla_u>(
2409 IC, II, true))
2410 return FMLA_U;
2411 return instCombineSVEVectorBinOp(IC, II);
2412}
2413
2414static std::optional<Instruction *>
2415 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2416 if (auto FMLS =
2417 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2418 Intrinsic::aarch64_sve_fmls>(IC, II,
2419 true))
2420 return FMLS;
2421 if (auto FMSB =
2422 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2423 Intrinsic::aarch64_sve_fnmsb>(
2424 IC, II, false))
2425 return FMSB;
2426 if (auto FMLS =
2427 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2428 Intrinsic::aarch64_sve_fmls>(IC, II,
2429 true))
2430 return FMLS;
2431 return std::nullopt;
2432}
2433
2434static std::optional<Instruction *>
2435 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2436 if (auto FMLS =
2437 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2438 Intrinsic::aarch64_sve_fmls>(IC, II,
2439 true))
2440 return FMLS;
2441 if (auto FMSB =
2442 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2443 Intrinsic::aarch64_sve_fnmsb>(
2444 IC, II, false))
2445 return FMSB;
2446 if (auto FMLS_U =
2447 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2448 Intrinsic::aarch64_sve_fmls_u>(
2449 IC, II, true))
2450 return FMLS_U;
2451 return instCombineSVEVectorBinOp(IC, II);
2452}
2453
2454static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2455 IntrinsicInst &II) {
2456 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2457 Intrinsic::aarch64_sve_mls>(
2458 IC, II, true))
2459 return MLS;
2460 return std::nullopt;
2461}
2462
2463static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2464 IntrinsicInst &II) {
2465 Value *UnpackArg = II.getArgOperand(0);
2466 auto *RetTy = cast<ScalableVectorType>(II.getType());
2467 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2468 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2469
2470 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2471 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2472 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2473 ScalarArg =
2474 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2475 Value *NewVal =
2476 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2477 NewVal->takeName(&II);
2478 return IC.replaceInstUsesWith(II, NewVal);
2479 }
2480
2481 return std::nullopt;
2482}
2483static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2484 IntrinsicInst &II) {
2485 auto *OpVal = II.getOperand(0);
2486 auto *OpIndices = II.getOperand(1);
2487 VectorType *VTy = cast<VectorType>(II.getType());
2488
2489 // Check whether OpIndices is a constant splat value < minimal element count
2490 // of result.
2491 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2492 if (!SplatValue ||
2493 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2494 return std::nullopt;
2495
2496 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2497 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2498 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2499 auto *VectorSplat =
2500 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2501
2502 VectorSplat->takeName(&II);
2503 return IC.replaceInstUsesWith(II, VectorSplat);
2504}
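// Illustrative sketch (annotation, not part of the upstream source):
//   %idx = call ... @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
//   %r   = call ... @llvm.aarch64.sve.tbl.nxv4i32(<vscale x 4 x i32> %v, %idx)
// becomes an extractelement of lane 2 from %v followed by a generic splat of
// that scalar, as long as the splatted index is below the minimum lane count.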
2505
2506static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2507 IntrinsicInst &II) {
2508 Value *A, *B;
2509 Type *RetTy = II.getType();
2510 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2511 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2512
2513 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2514 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2515 if ((match(II.getArgOperand(0),
2516 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
2517 match(II.getArgOperand(1),
2518 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
2519 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2520 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2521 auto *TyA = cast<ScalableVectorType>(A->getType());
2522 if (TyA == B->getType() &&
2523 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
2524 auto *SubVec = IC.Builder.CreateInsertVector(
2525 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2526 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2527 TyA->getMinNumElements());
2528 ConcatVec->takeName(&II);
2529 return IC.replaceInstUsesWith(II, ConcatVec);
2530 }
2531 }
2532
2533 return std::nullopt;
2534}
2535
2536static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2537 IntrinsicInst &II) {
2538 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2539 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2540 Value *A, *B;
2541 if (match(II.getArgOperand(0),
2542 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
2543 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2544 m_Specific(A), m_Specific(B))))
2545 return IC.replaceInstUsesWith(
2546 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2547
2548 return std::nullopt;
2549}
2550
2551 static std::optional<Instruction *>
2552 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2553 Value *Mask = II.getOperand(0);
2554 Value *BasePtr = II.getOperand(1);
2555 Value *Index = II.getOperand(2);
2556 Type *Ty = II.getType();
2557 Value *PassThru = ConstantAggregateZero::get(Ty);
2558
2559 // Contiguous gather => masked load.
2560 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2561 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2562 Value *IndexBase;
2563 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2564 m_Value(IndexBase), m_SpecificInt(1)))) {
2565 Align Alignment =
2566 BasePtr->getPointerAlignment(II.getDataLayout());
2567
2568 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2569 BasePtr, IndexBase);
2570 CallInst *MaskedLoad =
2571 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2572 MaskedLoad->takeName(&II);
2573 return IC.replaceInstUsesWith(II, MaskedLoad);
2574 }
2575
2576 return std::nullopt;
2577}
2578
2579 static std::optional<Instruction *>
2580 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2581 Value *Val = II.getOperand(0);
2582 Value *Mask = II.getOperand(1);
2583 Value *BasePtr = II.getOperand(2);
2584 Value *Index = II.getOperand(3);
2585 Type *Ty = Val->getType();
2586
2587 // Contiguous scatter => masked store.
2588 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2589 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2590 Value *IndexBase;
2591 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
2592 m_Value(IndexBase), m_SpecificInt(1)))) {
2593 Align Alignment =
2594 BasePtr->getPointerAlignment(II.getDataLayout());
2595
2596 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2597 BasePtr, IndexBase);
2598 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2599
2600 return IC.eraseInstFromFunction(II);
2601 }
2602
2603 return std::nullopt;
2604}
2605
2606static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2607 IntrinsicInst &II) {
2609 Value *Pred = II.getOperand(0);
2610 Value *Vec = II.getOperand(1);
2611 Value *DivVec = II.getOperand(2);
2612
2613 Value *SplatValue = getSplatValue(DivVec);
2614 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2615 if (!SplatConstantInt)
2616 return std::nullopt;
2617
2618 APInt Divisor = SplatConstantInt->getValue();
2619 const int64_t DivisorValue = Divisor.getSExtValue();
2620 if (DivisorValue == -1)
2621 return std::nullopt;
2622 if (DivisorValue == 1)
2623 IC.replaceInstUsesWith(II, Vec);
2624
2625 if (Divisor.isPowerOf2()) {
2626 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2627 auto ASRD = IC.Builder.CreateIntrinsic(
2628 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2629 return IC.replaceInstUsesWith(II, ASRD);
2630 }
2631 if (Divisor.isNegatedPowerOf2()) {
2632 Divisor.negate();
2633 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2634 auto ASRD = IC.Builder.CreateIntrinsic(
2635 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2636 auto NEG = IC.Builder.CreateIntrinsic(
2637 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2638 return IC.replaceInstUsesWith(II, NEG);
2639 }
2640
2641 return std::nullopt;
2642}
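// Illustrative sketch (annotation, not part of the upstream source): a signed
// divide by a power-of-two splat, e.g.
//   %r = call ... @llvm.aarch64.sve.sdiv.nxv4i32(%pg, %x, splat (i32 8))
// becomes @llvm.aarch64.sve.asrd.nxv4i32(%pg, %x, i32 3); for splat (i32 -8)
// the asrd result is additionally negated with @llvm.aarch64.sve.neg.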
2643
2644bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2645 size_t VecSize = Vec.size();
2646 if (VecSize == 1)
2647 return true;
2648 if (!isPowerOf2_64(VecSize))
2649 return false;
2650 size_t HalfVecSize = VecSize / 2;
2651
2652 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2653 RHS != Vec.end(); LHS++, RHS++) {
2654 if (*LHS != nullptr && *RHS != nullptr) {
2655 if (*LHS == *RHS)
2656 continue;
2657 else
2658 return false;
2659 }
2660 if (!AllowPoison)
2661 return false;
2662 if (*LHS == nullptr && *RHS != nullptr)
2663 *LHS = *RHS;
2664 }
2665
2666 Vec.resize(HalfVecSize);
2667 SimplifyValuePattern(Vec, AllowPoison);
2668 return true;
2669}
2670
2671// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2672// to dupqlane(f64(C)) where C is A concatenated with B
2673static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2674 IntrinsicInst &II) {
2675 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2676 if (!match(II.getOperand(0),
2677 m_Intrinsic<Intrinsic::vector_insert>(
2678 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2679 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2680 return std::nullopt;
2681 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2682
2683 // Insert the scalars into a container ordered by InsertElement index
2684 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2685 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2686 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2687 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2688 CurrentInsertElt = InsertElt->getOperand(0);
2689 }
2690
2691 bool AllowPoison =
2692 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2693 if (!SimplifyValuePattern(Elts, AllowPoison))
2694 return std::nullopt;
2695
2696 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2697 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2698 for (size_t I = 0; I < Elts.size(); I++) {
2699 if (Elts[I] == nullptr)
2700 continue;
2701 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2702 IC.Builder.getInt64(I));
2703 }
2704 if (InsertEltChain == nullptr)
2705 return std::nullopt;
2706
2707 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2708 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2709 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2710 // be narrowed back to the original type.
2711 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2712 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2713 IIScalableTy->getMinNumElements() /
2714 PatternWidth;
2715
2716 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2717 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2718 auto *WideShuffleMaskTy =
2719 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2720
2721 auto InsertSubvector = IC.Builder.CreateInsertVector(
2722 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2723 uint64_t(0));
2724 auto WideBitcast =
2725 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2726 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2727 auto WideShuffle = IC.Builder.CreateShuffleVector(
2728 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2729 auto NarrowBitcast =
2730 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2731
2732 return IC.replaceInstUsesWith(II, NarrowBitcast);
2733}
2734
2735static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2736 IntrinsicInst &II) {
2737 Value *A = II.getArgOperand(0);
2738 Value *B = II.getArgOperand(1);
2739 if (A == B)
2740 return IC.replaceInstUsesWith(II, A);
2741
2742 return std::nullopt;
2743}
2744
2745static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2746 IntrinsicInst &II) {
2747 Value *Pred = II.getOperand(0);
2748 Value *Vec = II.getOperand(1);
2749 Value *Shift = II.getOperand(2);
2750
2751 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2752 Value *AbsPred, *MergedValue;
2753 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2754 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2755 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2756 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2757
2758 return std::nullopt;
2759
2760 // Transform is valid if any of the following are true:
2761 // * The ABS merge value is an undef or non-negative
2762 // * The ABS predicate is all active
2763 // * The ABS predicate and the SRSHL predicates are the same
2764 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2765 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2766 return std::nullopt;
2767
2768 // Only valid when the shift amount is non-negative, otherwise the rounding
2769 // behaviour of SRSHL cannot be ignored.
2770 if (!match(Shift, m_NonNegative()))
2771 return std::nullopt;
2772
2773 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2774 {II.getType()}, {Pred, Vec, Shift});
2775
2776 return IC.replaceInstUsesWith(II, LSL);
2777}
2778
2779static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2780 IntrinsicInst &II) {
2781 Value *Vec = II.getOperand(0);
2782
2783 if (getSplatValue(Vec) == II.getOperand(1))
2784 return IC.replaceInstUsesWith(II, Vec);
2785
2786 return std::nullopt;
2787}
2788
2789static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2790 IntrinsicInst &II) {
2791 // If this barrier is post-dominated by an identical one, we can remove it.
2792 auto *NI = II.getNextNode();
2793 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2794 auto CanSkipOver = [](Instruction *I) {
2795 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2796 };
2797 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2798 auto *NIBB = NI->getParent();
2799 NI = NI->getNextNode();
2800 if (!NI) {
2801 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2802 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2803 else
2804 break;
2805 }
2806 }
2807 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2808 if (NextII && II.isIdenticalTo(NextII))
2809 return IC.eraseInstFromFunction(II);
2810
2811 return std::nullopt;
2812}
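// Illustrative sketch (annotation, not part of the upstream source): two
// back-to-back calls to @llvm.aarch64.dmb(i32 11) (the inner-shareable "ish"
// barrier) with no intervening memory access or other side effect allow the
// first barrier to be erased, since the second already provides the ordering.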
2813
2814static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2815 IntrinsicInst &II) {
2816 return IC.replaceInstUsesWith(
2817 II,
2818 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
2819 {II.getType(), II.getOperand(0)->getType()},
2820 {II.getOperand(0), II.getOperand(1)}));
2821}
2822
2823static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2824 IntrinsicInst &II) {
2825 if (match(II.getOperand(0), m_ConstantInt<AArch64SVEPredPattern::all>()))
2826 return IC.replaceInstUsesWith(II, Constant::getAllOnesValue(II.getType()));
2827 return std::nullopt;
2828}
2829
2830static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2831 IntrinsicInst &II,
2832 unsigned NumBits) {
2833 Value *Passthru = II.getOperand(0);
2834 Value *Pg = II.getOperand(1);
2835 Value *Op = II.getOperand(2);
2836
2837 // Convert UXT[BHW] to AND.
2838 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
2839 auto *Ty = cast<VectorType>(II.getType());
2840 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
2841 auto *Mask = ConstantInt::get(Ty, MaskValue);
2842 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
2843 {Pg, Op, Mask});
2844 return IC.replaceInstUsesWith(II, And);
2845 }
2846
2847 return std::nullopt;
2848}
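// Illustrative sketch (annotation, not part of the upstream source): with an
// undef passthru or an all-active predicate,
//   %r = call ... @llvm.aarch64.sve.uxtb.nxv4i32(%passthru, %pg, %x)
// becomes @llvm.aarch64.sve.and.u.nxv4i32(%pg, %x, splat (i32 255)), i.e. a
// simple mask of the low 8 bits.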
2849
2850 static std::optional<Instruction *>
2851 instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2852 SMEAttrs FnSMEAttrs(*II.getFunction());
2853 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2854 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
2855 return IC.replaceInstUsesWith(
2856 II, ConstantInt::getBool(II.getType(), IsStreaming));
2857 return std::nullopt;
2858}
2859
2860std::optional<Instruction *>
2861 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2862 IntrinsicInst &II) const {
2863 const SVEIntrinsicInfo IInfo = constructSVEIntrinsicInfo(II);
2864 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2865 return I;
2866
2867 Intrinsic::ID IID = II.getIntrinsicID();
2868 switch (IID) {
2869 default:
2870 break;
2871 case Intrinsic::aarch64_dmb:
2872 return instCombineDMB(IC, II);
2873 case Intrinsic::aarch64_neon_fmaxnm:
2874 case Intrinsic::aarch64_neon_fminnm:
2875 return instCombineMaxMinNM(IC, II);
2876 case Intrinsic::aarch64_sve_convert_from_svbool:
2877 return instCombineConvertFromSVBool(IC, II);
2878 case Intrinsic::aarch64_sve_dup:
2879 return instCombineSVEDup(IC, II);
2880 case Intrinsic::aarch64_sve_dup_x:
2881 return instCombineSVEDupX(IC, II);
2882 case Intrinsic::aarch64_sve_cmpne:
2883 case Intrinsic::aarch64_sve_cmpne_wide:
2884 return instCombineSVECmpNE(IC, II);
2885 case Intrinsic::aarch64_sve_rdffr:
2886 return instCombineRDFFR(IC, II);
2887 case Intrinsic::aarch64_sve_lasta:
2888 case Intrinsic::aarch64_sve_lastb:
2889 return instCombineSVELast(IC, II);
2890 case Intrinsic::aarch64_sve_clasta_n:
2891 case Intrinsic::aarch64_sve_clastb_n:
2892 return instCombineSVECondLast(IC, II);
2893 case Intrinsic::aarch64_sve_cntd:
2894 return instCombineSVECntElts(IC, II, 2);
2895 case Intrinsic::aarch64_sve_cntw:
2896 return instCombineSVECntElts(IC, II, 4);
2897 case Intrinsic::aarch64_sve_cnth:
2898 return instCombineSVECntElts(IC, II, 8);
2899 case Intrinsic::aarch64_sve_cntb:
2900 return instCombineSVECntElts(IC, II, 16);
2901 case Intrinsic::aarch64_sme_cntsd:
2902 return instCombineSMECntsd(IC, II, ST);
2903 case Intrinsic::aarch64_sve_ptest_any:
2904 case Intrinsic::aarch64_sve_ptest_first:
2905 case Intrinsic::aarch64_sve_ptest_last:
2906 return instCombineSVEPTest(IC, II);
2907 case Intrinsic::aarch64_sve_fadd:
2908 return instCombineSVEVectorFAdd(IC, II);
2909 case Intrinsic::aarch64_sve_fadd_u:
2910 return instCombineSVEVectorFAddU(IC, II);
2911 case Intrinsic::aarch64_sve_fmul_u:
2912 return instCombineSVEVectorBinOp(IC, II);
2913 case Intrinsic::aarch64_sve_fsub:
2914 return instCombineSVEVectorFSub(IC, II);
2915 case Intrinsic::aarch64_sve_fsub_u:
2916 return instCombineSVEVectorFSubU(IC, II);
2917 case Intrinsic::aarch64_sve_add:
2918 return instCombineSVEVectorAdd(IC, II);
2919 case Intrinsic::aarch64_sve_add_u:
2920 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2921 Intrinsic::aarch64_sve_mla_u>(
2922 IC, II, true);
2923 case Intrinsic::aarch64_sve_sub:
2924 return instCombineSVEVectorSub(IC, II);
2925 case Intrinsic::aarch64_sve_sub_u:
2926 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2927 Intrinsic::aarch64_sve_mls_u>(
2928 IC, II, true);
2929 case Intrinsic::aarch64_sve_tbl:
2930 return instCombineSVETBL(IC, II);
2931 case Intrinsic::aarch64_sve_uunpkhi:
2932 case Intrinsic::aarch64_sve_uunpklo:
2933 case Intrinsic::aarch64_sve_sunpkhi:
2934 case Intrinsic::aarch64_sve_sunpklo:
2935 return instCombineSVEUnpack(IC, II);
2936 case Intrinsic::aarch64_sve_uzp1:
2937 return instCombineSVEUzp1(IC, II);
2938 case Intrinsic::aarch64_sve_zip1:
2939 case Intrinsic::aarch64_sve_zip2:
2940 return instCombineSVEZip(IC, II);
2941 case Intrinsic::aarch64_sve_ld1_gather_index:
2942 return instCombineLD1GatherIndex(IC, II);
2943 case Intrinsic::aarch64_sve_st1_scatter_index:
2944 return instCombineST1ScatterIndex(IC, II);
2945 case Intrinsic::aarch64_sve_ld1:
2946 return instCombineSVELD1(IC, II, DL);
2947 case Intrinsic::aarch64_sve_st1:
2948 return instCombineSVEST1(IC, II, DL);
2949 case Intrinsic::aarch64_sve_sdiv:
2950 return instCombineSVESDIV(IC, II);
2951 case Intrinsic::aarch64_sve_sel:
2952 return instCombineSVESel(IC, II);
2953 case Intrinsic::aarch64_sve_srshl:
2954 return instCombineSVESrshl(IC, II);
2955 case Intrinsic::aarch64_sve_dupq_lane:
2956 return instCombineSVEDupqLane(IC, II);
2957 case Intrinsic::aarch64_sve_insr:
2958 return instCombineSVEInsr(IC, II);
2959 case Intrinsic::aarch64_sve_whilelo:
2960 return instCombineWhilelo(IC, II);
2961 case Intrinsic::aarch64_sve_ptrue:
2962 return instCombinePTrue(IC, II);
2963 case Intrinsic::aarch64_sve_uxtb:
2964 return instCombineSVEUxt(IC, II, 8);
2965 case Intrinsic::aarch64_sve_uxth:
2966 return instCombineSVEUxt(IC, II, 16);
2967 case Intrinsic::aarch64_sve_uxtw:
2968 return instCombineSVEUxt(IC, II, 32);
2969 case Intrinsic::aarch64_sme_in_streaming_mode:
2970 return instCombineInStreamingMode(IC, II);
2971 }
2972
2973 return std::nullopt;
2974}
2975
2976 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2977 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2978 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2979 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2980 SimplifyAndSetOp) const {
2981 switch (II.getIntrinsicID()) {
2982 default:
2983 break;
2984 case Intrinsic::aarch64_neon_fcvtxn:
2985 case Intrinsic::aarch64_neon_rshrn:
2986 case Intrinsic::aarch64_neon_sqrshrn:
2987 case Intrinsic::aarch64_neon_sqrshrun:
2988 case Intrinsic::aarch64_neon_sqshrn:
2989 case Intrinsic::aarch64_neon_sqshrun:
2990 case Intrinsic::aarch64_neon_sqxtn:
2991 case Intrinsic::aarch64_neon_sqxtun:
2992 case Intrinsic::aarch64_neon_uqrshrn:
2993 case Intrinsic::aarch64_neon_uqshrn:
2994 case Intrinsic::aarch64_neon_uqxtn:
2995 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2996 break;
2997 }
2998
2999 return std::nullopt;
3000}
3001
3002 bool AArch64TTIImpl::enableScalableVectorization() const {
3003 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3004 EnableScalableAutovecInStreamingMode);
3005}
3006
3007 TypeSize
3008 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3009 switch (K) {
3010 case TargetTransformInfo::RGK_Scalar:
3011 return TypeSize::getFixed(64);
3012 case TargetTransformInfo::RGK_FixedWidthVector:
3013 if (ST->useSVEForFixedLengthVectors() &&
3014 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3015 return TypeSize::getFixed(
3016 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3017 else if (ST->isNeonAvailable())
3018 return TypeSize::getFixed(128);
3019 else
3020 return TypeSize::getFixed(0);
3021 case TargetTransformInfo::RGK_ScalableVector:
3022 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3023 EnableScalableAutovecInStreamingMode))
3024 return TypeSize::getScalable(128);
3025 else
3026 return TypeSize::getScalable(0);
3027 }
3028 llvm_unreachable("Unsupported register kind");
3029}
3030
3031bool AArch64TTIImpl::isSingleExtWideningInstruction(
3032 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3033 Type *SrcOverrideTy) const {
3034 // A helper that returns a vector type from the given type. The number of
3035 // elements in type Ty determines the vector width.
3036 auto toVectorTy = [&](Type *ArgTy) {
3037 return VectorType::get(ArgTy->getScalarType(),
3038 cast<VectorType>(DstTy)->getElementCount());
3039 };
3040
3041 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3042 // i32, i64]. SVE doesn't generally have the same set of instructions to
3043 // perform an extend with the add/sub/mul. There are SMULLB style
3044 // instructions, but they operate on top/bottom, requiring some sort of lane
3045 // interleaving to be used with zext/sext.
3046 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3047 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3048 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3049 return false;
3050
3051 Type *SrcTy = SrcOverrideTy;
3052 switch (Opcode) {
3053 case Instruction::Add: // UADDW(2), SADDW(2).
3054 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3055 // The second operand needs to be an extend
3056 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3057 if (!SrcTy)
3058 SrcTy =
3059 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3060 break;
3061 }
3062
3063 if (Opcode == Instruction::Sub)
3064 return false;
3065
3066 // UADDW(2), SADDW(2) can be commuted.
3067 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3068 if (!SrcTy)
3069 SrcTy =
3070 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3071 break;
3072 }
3073 return false;
3074 }
3075 default:
3076 return false;
3077 }
3078
3079 // Legalize the destination type and ensure it can be used in a widening
3080 // operation.
3081 auto DstTyL = getTypeLegalizationCost(DstTy);
3082 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3083 return false;
3084
3085 // Legalize the source type and ensure it can be used in a widening
3086 // operation.
3087 assert(SrcTy && "Expected some SrcTy");
3088 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3089 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3090 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3091 return false;
3092
3093 // Get the total number of vector elements in the legalized types.
3094 InstructionCost NumDstEls =
3095 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3096 InstructionCost NumSrcEls =
3097 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3098
3099 // Return true if the legalized types have the same number of vector elements
3100 // and the destination element type size is twice that of the source type.
3101 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3102}
3103
3104Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3105 ArrayRef<const Value *> Args,
3106 Type *SrcOverrideTy) const {
3107 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3108 Opcode != Instruction::Mul)
3109 return nullptr;
3110
3111 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3112 // i32, i64]. SVE doesn't generally have the same set of instructions to
3113 // perform an extend with the add/sub/mul. There are SMULLB style
3114 // instructions, but they operate on top/bottom, requiring some sort of lane
3115 // interleaving to be used with zext/sext.
3116 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3117 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3118 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3119 return nullptr;
3120
3121 auto getScalarSizeWithOverride = [&](const Value *V) {
3122 if (SrcOverrideTy)
3123 return SrcOverrideTy->getScalarSizeInBits();
3124 return cast<Instruction>(V)
3125 ->getOperand(0)
3126 ->getType()
3127 ->getScalarSizeInBits();
3128 };
3129
3130 unsigned MaxEltSize = 0;
3131 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3132 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3133 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3134 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3135 MaxEltSize = std::max(EltSize0, EltSize1);
3136 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3137 isa<SExtInst, ZExtInst>(Args[1])) {
3138 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3139 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3140 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3141 // enough.
3142 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3143 return nullptr;
3144 MaxEltSize = DstEltSize / 2;
3145 } else if (Opcode == Instruction::Mul &&
3146 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3147 // If one of the operands is a Zext and the other has enough zero bits
3148 // to be treated as unsigned, we can still generate a umull, meaning the
3149 // zext is free.
3150 KnownBits Known =
3151 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3152 if (Args[0]->getType()->getScalarSizeInBits() -
3153 Known.Zero.countLeadingOnes() >
3154 DstTy->getScalarSizeInBits() / 2)
3155 return nullptr;
3156
3157 MaxEltSize =
3158 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3159 } else
3160 return nullptr;
3161
3162 if (MaxEltSize * 2 > DstEltSize)
3163 return nullptr;
3164
3165 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3166 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3167 return nullptr;
3168 return ExtTy;
3169}
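// Illustrative sketch (annotation, not part of the upstream source): for
//   mul <8 x i16> (sext <8 x i8> %a to <8 x i16>), (sext <8 x i8> %b to <8 x i16>)
// this returns the <8 x i16> extend type, reflecting that both extends fold
// into an smull. Extends wider than half the destination element size, or an
// extend type of 64 bits or fewer, produce nullptr instead.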
3170
3171// s/urhadd instructions implement the following pattern, making the
3172// extends free:
3173// %x = add ((zext i8 -> i16), 1)
3174// %y = (zext i8 -> i16)
3175// trunc i16 (lshr (add %x, %y), 1) -> i8
3176//
3177 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
3178 Type *Src) const {
3179 // The source should be a legal vector type.
3180 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3181 (Src->isScalableTy() && !ST->hasSVE2()))
3182 return false;
3183
3184 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3185 return false;
3186
3187 // Look for trunc/shl/add before trying to match the pattern.
3188 const Instruction *Add = ExtUser;
3189 auto *AddUser =
3190 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3191 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3192 Add = AddUser;
3193
3194 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3195 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3196 return false;
3197
3198 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3199 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3200 Src->getScalarSizeInBits() !=
3201 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3202 return false;
3203
3204 // Try to match the whole pattern. Ext could be either the first or second
3205 // m_ZExtOrSExt matched.
3206 Instruction *Ex1, *Ex2;
3207 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3208 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
3209 return false;
3210
3211 // Ensure both extends are of the same type
3212 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3213 Ex1->getOpcode() == Ex2->getOpcode())
3214 return true;
3215
3216 return false;
3217}
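// A concrete IR instance of the rounding-halving-add pattern sketched above
// (illustrative only):
//   %xa = zext <16 x i8> %a to <16 x i16>
//   %xb = zext <16 x i8> %b to <16 x i16>
//   %s  = add <16 x i16> %xa, splat (i16 1)
//   %t  = add <16 x i16> %s, %xb
//   %l  = lshr <16 x i16> %t, splat (i16 1)
//   %r  = trunc <16 x i16> %l to <16 x i8>
// The backend can select this as a single urhadd, so the two zexts are
// treated as free here.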
3218
3219InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3220 Type *Src,
3221 TTI::CastContextHint CCH,
3222 TTI::TargetCostKind CostKind,
3223 const Instruction *I) const {
3224 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3225 assert(ISD && "Invalid opcode");
3226 // If the cast is observable, and it is used by a widening instruction (e.g.,
3227 // uaddl, saddw, etc.), it may be free.
3228 if (I && I->hasOneUser()) {
3229 auto *SingleUser = cast<Instruction>(*I->user_begin());
3230 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3231 if (Type *ExtTy = isBinExtWideningInstruction(
3232 SingleUser->getOpcode(), Dst, Operands,
3233 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3234 // The cost from Src->Src*2 needs to be added if required, the cost from
3235 // Src*2->ExtTy is free.
3236 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3237 Type *DoubleSrcTy =
3238 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3239 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3240 TTI::CastContextHint::None, CostKind);
3241 }
3242
3243 return 0;
3244 }
3245
3246 if (isSingleExtWideningInstruction(
3247 SingleUser->getOpcode(), Dst, Operands,
3248 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3249 // For adds only count the second operand as free if both operands are
3250 // extends but not the same operation. (i.e both operands are not free in
3251 // add(sext, zext)).
3252 if (SingleUser->getOpcode() == Instruction::Add) {
3253 if (I == SingleUser->getOperand(1) ||
3254 (isa<CastInst>(SingleUser->getOperand(1)) &&
3255 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3256 return 0;
3257 } else {
3258 // Others are free so long as isSingleExtWideningInstruction
3259 // returned true.
3260 return 0;
3261 }
3262 }
3263
3264 // The cast will be free for the s/urhadd instructions
3265 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3266 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3267 return 0;
3268 }
3269
3270 // TODO: Allow non-throughput costs that aren't binary.
3271 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3272 if (CostKind != TTI::TCK_RecipThroughput)
3273 return Cost == 0 ? 0 : 1;
3274 return Cost;
3275 };
3276
3277 EVT SrcTy = TLI->getValueType(DL, Src);
3278 EVT DstTy = TLI->getValueType(DL, Dst);
3279
3280 if (!SrcTy.isSimple() || !DstTy.isSimple())
3281 return AdjustCost(
3282 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3283
3284 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3285 // we use fcvtx under SVE2. Give them invalid costs.
3286 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3287 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3288 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3289 return InstructionCost::getInvalid();
3290
3291 static const TypeConversionCostTblEntry BF16Tbl[] = {
3292 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3293 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3294 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3295 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3296 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3297 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3298 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3299 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3300 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3301 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3302 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3303 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3304 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3305 };
3306
3307 if (ST->hasBF16())
3308 if (const auto *Entry = ConvertCostTableLookup(
3309 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3310 return AdjustCost(Entry->Cost);
3311
3312 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3313 // The cost of unpacking twice is artificially increased for now in order
3314 // to avoid regressions against NEON, which will use tbl instructions directly
3315 // instead of multiple layers of [s|u]unpk[lo|hi].
3316 // We use the unpacks in cases where the destination type is illegal and
3317 // requires splitting of the input, even if the input type itself is legal.
3318 const unsigned int SVE_EXT_COST = 1;
3319 const unsigned int SVE_FCVT_COST = 1;
3320 const unsigned int SVE_UNPACK_ONCE = 4;
3321 const unsigned int SVE_UNPACK_TWICE = 16;
3322
3323 static const TypeConversionCostTblEntry ConversionTbl[] = {
3324 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3325 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3326 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3327 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3328 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3329 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3330 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3331 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3332 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3333 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3334 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3335 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3336 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3337 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3338 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3339 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3340 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3341 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3342 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3343 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3344
3345 // Truncations on nxvmiN
3346 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3347 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3348 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3349 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3350 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3351 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3352 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3353 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3354 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3355 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3356 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3357 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3358 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3359 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3360 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3361 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3362 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3363 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3364 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3365 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3366 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3367 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3368 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3369 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3370 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3371 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3372 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3373 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3374 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3375 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3376 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3377 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3378 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3379
3380 // The number of shll instructions for the extension.
3381 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3382 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3383 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3384 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3385 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3386 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3387 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3388 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3389 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3390 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3391 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3392 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3393 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3394 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3395 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3396 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3397
3398 // FP Ext and trunc
3399 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3400 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3401 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3402 // FP16
3403 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3404 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3405 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3406 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3407 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3408 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3409 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3410 // BF16 (uses shift)
3411 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3412 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3413 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3414 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3415 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3416 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3417 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3418 // FP Ext and trunc
3419 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3420 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3421 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3422 // FP16
3423 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3424 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3425 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3426 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3427 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3428 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3429 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3430 // BF16 (more complex; the +bf16 case is handled above)
3431 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3432 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3433 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3434 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3435 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3436 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3437 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3438 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3439
3440 // LowerVectorINT_TO_FP:
3441 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3442 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3443 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3444 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3445 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3446 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3447
3448 // SVE: to nxv2f16
3449 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3450 SVE_EXT_COST + SVE_FCVT_COST},
3451 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3452 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3453 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3454 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3455 SVE_EXT_COST + SVE_FCVT_COST},
3456 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3457 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3458 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3459
3460 // SVE: to nxv4f16
3461 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3462 SVE_EXT_COST + SVE_FCVT_COST},
3463 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3464 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3465 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3466 SVE_EXT_COST + SVE_FCVT_COST},
3467 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3468 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3469
3470 // SVE: to nxv8f16
3471 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3472 SVE_EXT_COST + SVE_FCVT_COST},
3473 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3474 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3475 SVE_EXT_COST + SVE_FCVT_COST},
3476 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3477
3478 // SVE: to nxv16f16
3479 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3480 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3481 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3482 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3483
3484 // Complex: to v2f32
3485 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3486 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3487 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3488 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3489
3490 // SVE: to nxv2f32
3491 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3492 SVE_EXT_COST + SVE_FCVT_COST},
3493 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3494 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3495 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3496 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3497 SVE_EXT_COST + SVE_FCVT_COST},
3498 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3499 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3500 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3501
3502 // Complex: to v4f32
3503 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3504 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3505 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3506 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3507
3508 // SVE: to nxv4f32
3509 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3510 SVE_EXT_COST + SVE_FCVT_COST},
3511 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3512 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3513 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3514 SVE_EXT_COST + SVE_FCVT_COST},
3515 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3516 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3517
3518 // Complex: to v8f32
3519 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3520 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3521 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3522 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3523
3524 // SVE: to nxv8f32
3525 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3526 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3527 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3528 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3529 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3530 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3531 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3532 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3533
3534 // SVE: to nxv16f32
3535 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3536 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3537 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3538 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3539
3540 // Complex: to v16f32
3541 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3542 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3543
3544 // Complex: to v2f64
3545 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3546 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3547 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3548 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3549 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3550 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3551
3552 // SVE: to nxv2f64
3553 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3554 SVE_EXT_COST + SVE_FCVT_COST},
3555 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3556 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3557 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3558 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3559 SVE_EXT_COST + SVE_FCVT_COST},
3560 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3561 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3562 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3563
3564 // Complex: to v4f64
3565 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3566 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3567
3568 // SVE: to nxv4f64
3569 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3570 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3571 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3572 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3573 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3574 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3575 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3576 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3577 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3578 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3579 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3580 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3581
3582 // SVE: to nxv8f64
3583 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3584 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3585 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3586 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3587 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3588 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3589 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3590 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3591
3592 // LowerVectorFP_TO_INT
3593 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3594 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3595 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3596 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3597 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3598 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3599
3600 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3601 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3602 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3603 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3604 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3605 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3606 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3607
3608 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3609 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3610 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3611 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3612 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3613
3614 // Complex, from nxv2f32.
3615 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3616 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3617 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3618 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3619 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3620 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3621 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3622 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3623
3624 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3625 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3626 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3627 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3628 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3629 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3630 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3631
3632 // Complex, from nxv2f64.
3633 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3634 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3635 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3636 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3637 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3638 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3639 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3640 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3641 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3642 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3643
3644 // Complex, from nxv4f32.
3645 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3646 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3647 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3648 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3649 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3650 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3651 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3652 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3653 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3654 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3655
3656 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3657 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3658 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3659 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3660 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3661
3662 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3663 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3664 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3665 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3666 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3667 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3668 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3669
3670 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3671 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3672 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3673 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3674 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3675
3676 // Complex, from nxv8f16.
3677 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3678 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3679 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3680 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3681 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3682 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3683 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3684 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3685 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3686 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3687
3688 // Complex, from nxv4f16.
3689 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3690 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3691 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3692 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3693 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3694 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3695 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3696 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3697
3698 // Complex, from nxv2f16.
3699 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3700 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3701 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3702 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3703 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3704 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3705 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3706 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3707
3708 // Truncate from nxvmf32 to nxvmf16.
3709 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3710 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3711 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3712
3713 // Truncate from nxvmf32 to nxvmbf16.
3714 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3715 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
3716 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
3717
3718 // Truncate from nxvmf64 to nxvmf16.
3719 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3720 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3721 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3722
3723 // Truncate from nxvmf64 to nxvmbf16.
3724 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
3725 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
3726 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
3727
3728 // Truncate from nxvmf64 to nxvmf32.
3729 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3730 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3731 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3732
3733 // Extend from nxvmf16 to nxvmf32.
3734 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3735 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3736 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3737
3738 // Extend from nxvmbf16 to nxvmf32.
3739 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
3740 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
3741 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
3742
3743 // Extend from nxvmf16 to nxvmf64.
3744 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3745 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3746 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3747
3748 // Extend from nxvmbf16 to nxvmf64.
3749 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
3750 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
3751 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
3752
3753 // Extend from nxvmf32 to nxvmf64.
3754 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3755 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3756 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3757
3758 // Bitcasts from float to integer
3759 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3760 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3761 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3762
3763 // Bitcasts from integer to float
3764 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3765 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3766 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3767
3768 // Add cost for extending to illegal -too wide- scalable vectors.
3769 // zero/sign extend are implemented by multiple unpack operations,
3770 // where each operation has a cost of 1.
3771 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3772 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3773 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3774 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3775 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3776 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3777
3778 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3779 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3780 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3781 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3782 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3783 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3784 };
3785
3786 // We have to estimate the cost of a fixed-length operation performed on
3787 // SVE registers by scaling with the number of SVE registers required to
3788 // represent the fixed-length type.
3789 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3790 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3791 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3792 ST->useSVEForFixedLengthVectors(WiderTy)) {
3793 std::pair<InstructionCost, MVT> LT =
3794 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3795 unsigned NumElements =
3796 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3797 return AdjustCost(
3798 LT.first *
3799 getCastInstrCost(
3800 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3801 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3802 CostKind, I));
3803 }
3804
3805 if (const auto *Entry = ConvertCostTableLookup(
3806 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3807 return AdjustCost(Entry->Cost);
3808
3809 static const TypeConversionCostTblEntry FP16Tbl[] = {
3810 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3811 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3812 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3813 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3814 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3815 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3816 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3817 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3818 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3819 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3820 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3821 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3822 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3823 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3824 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3825 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3826 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3827 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3828 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3829 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3830 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3831 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3832 };
3833
3834 if (ST->hasFullFP16())
3835 if (const auto *Entry = ConvertCostTableLookup(
3836 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3837 return AdjustCost(Entry->Cost);
3838
3839 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3840 // double-rounding issues.
3841 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3842 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3843 isa<FixedVectorType>(Dst) && isa<FixedVectorType>(Src))
3844 return AdjustCost(
3845 cast<FixedVectorType>(Dst)->getNumElements() *
3846 getCastInstrCost(Opcode, Dst->getScalarType(), Src->getScalarType(),
3847 CCH, CostKind) +
3848 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Src), false, true,
3849 CostKind) +
3850 BaseT::getScalarizationOverhead(cast<FixedVectorType>(Dst), true, false,
3851 CostKind));
3852
3853 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3854 CCH == TTI::CastContextHint::Masked &&
3855 ST->isSVEorStreamingSVEAvailable() &&
3856 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3857 TargetLowering::TypePromoteInteger &&
3858 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3859 TargetLowering::TypeSplitVector) {
3860 // The standard behaviour in the backend for these cases is to split the
3861 // extend up into two parts:
3862 // 1. Perform an extending load or masked load up to the legal type.
3863 // 2. Extend the loaded data to the final type.
3864 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3865 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3866 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3867 Opcode, LegalTy, Src, CCH, CostKind, I);
3868 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3869 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3870 return Part1 + Part2;
3871 }
3872
3873 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3874 // but we also want to include the TTI::CastContextHint::Masked case too.
3875 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3876 CCH == TTI::CastContextHint::Masked &&
3877 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3878 CCH = TTI::CastContextHint::Normal;
3879
3880 return AdjustCost(
3881 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3882}
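// Illustrative query: in isolation (no widening user), a zext from <8 x i8>
// to <8 x i32> hits the {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3} entry in
// ConversionTbl above, i.e. it is costed as roughly three shll-style
// instructions.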
3883
3884InstructionCost
3885AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3886 VectorType *VecTy, unsigned Index,
3887 TTI::TargetCostKind CostKind) const {
3888
3889 // Make sure we were given a valid extend opcode.
3890 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3891 "Invalid opcode");
3892
3893 // We are extending an element we extract from a vector, so the source type
3894 // of the extend is the element type of the vector.
3895 auto *Src = VecTy->getElementType();
3896
3897 // Sign- and zero-extends are for integer types only.
3898 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3899
3900 // Get the cost for the extract. We compute the cost (if any) for the extend
3901 // below.
3902 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3903 CostKind, Index, nullptr, nullptr);
3904
3905 // Legalize the types.
3906 auto VecLT = getTypeLegalizationCost(VecTy);
3907 auto DstVT = TLI->getValueType(DL, Dst);
3908 auto SrcVT = TLI->getValueType(DL, Src);
3909
3910 // If the resulting type is still a vector and the destination type is legal,
3911 // we may get the extension for free. If not, get the default cost for the
3912 // extend.
3913 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3914 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3915 CostKind);
3916
3917 // The destination type should be larger than the element type. If not, get
3918 // the default cost for the extend.
3919 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3920 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3921 CostKind);
3922
3923 switch (Opcode) {
3924 default:
3925 llvm_unreachable("Opcode should be either SExt or ZExt");
3926
3927 // For sign-extends, we only need a smov, which performs the extension
3928 // automatically.
3929 case Instruction::SExt:
3930 return Cost;
3931
3932 // For zero-extends, the extend is performed automatically by a umov unless
3933 // the destination type is i64 and the element type is i8 or i16.
3934 case Instruction::ZExt:
3935 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3936 return Cost;
3937 }
3938
3939 // If we are unable to perform the extend for free, get the default cost.
3940 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3941 CostKind);
3942}
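// For example, extracting lane 1 of a <4 x i32> and sign-extending it to i64
// is a single smov, so only the extract itself is costed above; zero-extending
// an i8 or i16 lane to i64 cannot be done by umov alone and therefore also
// pays the normal cast cost.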
3943
3944InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3945 TTI::TargetCostKind CostKind,
3946 const Instruction *I) const {
3947 if (CostKind != TTI::TCK_RecipThroughput)
3948 return Opcode == Instruction::PHI ? 0 : 1;
3949 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3950 // Branches are assumed to be predicted.
3951 return 0;
3952}
3953
3954InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3955 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3956 const Instruction *I, Value *Scalar,
3957 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3958 assert(Val->isVectorTy() && "This must be a vector type");
3959
3960 if (Index != -1U) {
3961 // Legalize the type.
3962 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3963
3964 // This type is legalized to a scalar type.
3965 if (!LT.second.isVector())
3966 return 0;
3967
3968 // The type may be split. For fixed-width vectors we can normalize the
3969 // index to the new type.
3970 if (LT.second.isFixedLengthVector()) {
3971 unsigned Width = LT.second.getVectorNumElements();
3972 Index = Index % Width;
3973 }
3974
3975 // The element at index zero is already inside the vector.
3976 // - For an insert-element or extract-element
3977 // instruction that extracts integers, an explicit FPR -> GPR move is
3978 // needed. So it has non-zero cost.
3979 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
3980 return 0;
3981
3982 // This is recognising a LD1 single-element structure to one lane of one
3983 // register instruction. I.e., if this is an `insertelement` instruction,
3984 // and its second operand is a load, then we will generate a LD1, which
3985 // are expensive instructions.
3986 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3987 return CostKind == TTI::TCK_CodeSize
3988 ? 0
3989 : ST->getVectorInsertExtractBaseCost();
3990
3991 // i1 inserts and extract will include an extra cset or cmp of the vector
3992 // value. Increase the cost by 1 to account.
3993 if (Val->getScalarSizeInBits() == 1)
3994 return CostKind == TTI::TCK_CodeSize
3995 ? 2
3996 : ST->getVectorInsertExtractBaseCost() + 1;
3997
3998 // FIXME:
3999 // If the extract-element and insert-element instructions could be
4000 // simplified away (e.g., could be combined into users by looking at use-def
4001 // context), they have no cost. This is not done in the first place for
4002 // compile-time considerations.
4003 }
4004
4005 // In case of Neon, if there exists extractelement from lane != 0 such that
4006 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4007 // 2. extractelement result feeds into fmul.
4008 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4009 // equivalent to 0.
4010 // then the extractelement can be merged with fmul in the backend and it
4011 // incurs no cost.
4012 // e.g.
4013 // define double @foo(<2 x double> %a) {
4014 // %1 = extractelement <2 x double> %a, i32 0
4015 // %2 = extractelement <2 x double> %a, i32 1
4016 // %res = fmul double %1, %2
4017 // ret double %res
4018 // }
4019 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4020 auto ExtractCanFuseWithFmul = [&]() {
4021 // We bail out if the extract is from lane 0.
4022 if (Index == 0)
4023 return false;
4024
4025 // Check if the scalar element type of the vector operand of ExtractElement
4026 // instruction is one of the allowed types.
4027 auto IsAllowedScalarTy = [&](const Type *T) {
4028 return T->isFloatTy() || T->isDoubleTy() ||
4029 (T->isHalfTy() && ST->hasFullFP16());
4030 };
4031
4032 // Check if the extractelement user is scalar fmul.
4033 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4034 // Check if the user is scalar fmul.
4035 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4036 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4037 !BO->getType()->isVectorTy();
4038 };
4039
4040 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4041 // certain scalar type and a certain vector register width.
4042 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4043 auto RegWidth =
4044 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
4045 .getFixedValue();
4046 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4047 };
4048
4049 // Check if the type constraints on input vector type and result scalar type
4050 // of extractelement instruction are satisfied.
4051 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4052 return false;
4053
4054 if (Scalar) {
4055 DenseMap<User *, unsigned> UserToExtractIdx;
4056 for (auto *U : Scalar->users()) {
4057 if (!IsUserFMulScalarTy(U))
4058 return false;
4059 // Recording entry for the user is important. Index value is not
4060 // important.
4061 UserToExtractIdx[U];
4062 }
4063 if (UserToExtractIdx.empty())
4064 return false;
4065 for (auto &[S, U, L] : ScalarUserAndIdx) {
4066 for (auto *U : S->users()) {
4067 if (UserToExtractIdx.contains(U)) {
4068 auto *FMul = cast<BinaryOperator>(U);
4069 auto *Op0 = FMul->getOperand(0);
4070 auto *Op1 = FMul->getOperand(1);
4071 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4072 UserToExtractIdx[U] = L;
4073 break;
4074 }
4075 }
4076 }
4077 }
4078 for (auto &[U, L] : UserToExtractIdx) {
4079 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4080 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4081 return false;
4082 }
4083 } else {
4084 const auto *EE = cast<ExtractElementInst>(I);
4085
4086 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4087 if (!IdxOp)
4088 return false;
4089
4090 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4091 if (!IsUserFMulScalarTy(U))
4092 return false;
4093
4094 // Check if the other operand of extractelement is also extractelement
4095 // from lane equivalent to 0.
4096 const auto *BO = cast<BinaryOperator>(U);
4097 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4098 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4099 if (OtherEE) {
4100 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4101 if (!IdxOp)
4102 return false;
4103 return IsExtractLaneEquivalentToZero(
4104 cast<ConstantInt>(OtherEE->getIndexOperand())
4105 ->getValue()
4106 .getZExtValue(),
4107 OtherEE->getType()->getScalarSizeInBits());
4108 }
4109 return true;
4110 });
4111 }
4112 return true;
4113 };
4114
4115 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4116 ExtractCanFuseWithFmul())
4117 return 0;
4118
4119 // All other insert/extracts cost this much.
4120 return CostKind == TTI::TCK_CodeSize ? 1
4121 : ST->getVectorInsertExtractBaseCost();
4122}
4123
4126 unsigned Index,
4127 const Value *Op0,
4128 const Value *Op1) const {
4129 // Treat insert at lane 0 into a poison vector as having zero cost. This
4130 // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4131 // single dup) are treated as cheap.
4132 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4133 isa<PoisonValue>(Op0))
4134 return 0;
4135 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
4136}
4137
4139 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4140 Value *Scalar,
4141 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
4142 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4143 ScalarUserAndIdx);
4144}
4145
4146InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
4147 Type *Val,
4148 TTI::TargetCostKind CostKind,
4149 unsigned Index) const {
4150 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
4151}
4152
4156 unsigned Index) const {
4157 if (isa<FixedVectorType>(Val))
4158 return getVectorInstrCost(Instruction::ExtractElement, Val, CostKind,
4159 Index);
4160
4161 // This typically requires both while and lastb instructions in order
4162 // to extract the last element. If this is in a loop the while
4163 // instruction can at least be hoisted out, although it will consume a
4164 // predicate register. The cost should be more expensive than the base
4165 // extract cost, which is 2 for most CPUs.
4166 return CostKind == TTI::TCK_CodeSize
4167 ? 2
4168 : ST->getVectorInsertExtractBaseCost() + 1;
4169}
4170
4171InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4172 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4173 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
4174 ArrayRef<Value *> VL) const {
4175 if (isa<ScalableVectorType>(Ty))
4176 return InstructionCost::getInvalid();
4177 if (Ty->getElementType()->isFloatingPointTy())
4178 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4179 CostKind);
4180 unsigned VecInstCost =
4181 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4182 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4183}
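// e.g. scalarizing all four lanes of a <4 x i32> for insertion only
// (DemandedElts = 0b1111, Insert = true, Extract = false) is modelled as
// 4 * 1 * getVectorInsertExtractBaseCost() for throughput-style cost kinds.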
4184
4185std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4186 Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4187 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4188 std::function<InstructionCost(Type *)> InstCost) const {
4189 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4190 return std::nullopt;
4191 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4192 return std::nullopt;
4193 if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
4194 ST->isNonStreamingSVEorSME2Available())
4195 return std::nullopt;
4196
4197 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4198 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
4199 TTI::CastContextHint::None, CostKind);
4200 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4201 Cost *= 2;
4202 Cost += InstCost(PromotedTy);
4203 if (IncludeTrunc)
4204 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4205 TTI::CastContextHint::None, CostKind);
4206 return Cost;
4207}
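// Illustrative use: an fadd on <4 x half> without +fullfp16 is costed by this
// helper as the fpext of the operands to <4 x float> (counted twice when
// neither operand is constant), the fadd at <4 x float>, and an fptrunc back
// to <4 x half> when IncludeTrunc is set.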
4208
4209InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4210 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4211 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4212 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
4213
4214 // The code-generator is currently not able to handle scalable vectors
4215 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4216 // it. This change will be removed when code-generation for these types is
4217 // sufficiently reliable.
4218 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4219 if (VTy->getElementCount() == ElementCount::getScalable(1))
4220 return InstructionCost::getInvalid();
4221
4222 // TODO: Handle more cost kinds.
4223 if (CostKind != TTI::TCK_RecipThroughput)
4224 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4225 Op2Info, Args, CxtI);
4226
4227 // Legalize the type.
4228 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4229 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4230
4231 // Increase the cost for half and bfloat types if not architecturally
4232 // supported.
4233 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4234 ISD == ISD::FDIV || ISD == ISD::FREM)
4235 if (auto PromotedCost = getFP16BF16PromoteCost(
4236 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4237 // There is not native support for fdiv/frem even with +sve-b16b16.
4238 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4239 [&](Type *PromotedTy) {
4240 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4241 Op1Info, Op2Info);
4242 }))
4243 return *PromotedCost;
4244
4245 // If the operation is a widening instruction (smull or umull) and both
4246 // operands are extends the cost can be cheaper by considering that the
4247 // operation will operate on the narrowest type size possible (double the
4248 // largest input size) and a further extend.
4249 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4250 if (ExtTy != Ty)
4251 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4252 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4253 TTI::CastContextHint::None, CostKind);
4254 return LT.first;
4255 }
4256
4257 switch (ISD) {
4258 default:
4259 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4260 Op2Info);
4261 case ISD::SREM:
4262 case ISD::SDIV:
4263 /*
4264 Notes for sdiv/srem specific costs:
4265 1. This only considers the cases where the divisor is constant, uniform and
4266 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4267 result in some form of (ldr + adrp), corresponding to constant vectors, or
4268 scalarization of the division operation.
4269 2. Constant divisors, either negative in whole or partially, don't result in
4270 significantly different codegen as compared to positive constant divisors.
4271 So, we don't consider negative divisors separately.
4272 3. If the codegen is significantly different with SVE, it has been indicated
4273 using comments at appropriate places.
4274
4275 sdiv specific cases:
4276 -----------------------------------------------------------------------
4277 codegen | pow-of-2 | Type
4278 -----------------------------------------------------------------------
4279 add + cmp + csel + asr | Y | i64
4280 add + cmp + csel + asr | Y | i32
4281 -----------------------------------------------------------------------
4282
4283 srem specific cases:
4284 -----------------------------------------------------------------------
4285 codegen | pow-of-2 | Type
4286 -----------------------------------------------------------------------
4287 negs + and + and + csneg | Y | i64
4288 negs + and + and + csneg | Y | i32
4289 -----------------------------------------------------------------------
4290
4291 other sdiv/srem cases:
4292 -------------------------------------------------------------------------
4293 common codegen | + srem | + sdiv | pow-of-2 | Type
4294 -------------------------------------------------------------------------
4295 smulh + asr + add + add | - | - | N | i64
4296 smull + lsr + add + add | - | - | N | i32
4297 usra | and + sub | sshr | Y | <2 x i64>
4298 2 * (scalar code) | - | - | N | <2 x i64>
4299 usra | bic + sub | sshr + neg | Y | <4 x i32>
4300 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4301 + sshr + usra | | | |
4302 -------------------------------------------------------------------------
4303 */
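// For instance, a scalar 'sdiv i32 %x, 8' takes the power-of-2 path below and
// is costed as 3 * AddCost + AsrCost (the add+cmp+csel+asr sequence), while
// the matching 'srem i32 %x, 8' is costed as 3 * AsrCost + AddCost.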
4304 if (Op2Info.isConstant() && Op2Info.isUniform()) {
4305 InstructionCost AddCost =
4306 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4307 Op1Info.getNoProps(), Op2Info.getNoProps());
4308 InstructionCost AsrCost =
4309 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4310 Op1Info.getNoProps(), Op2Info.getNoProps());
4311 InstructionCost MulCost =
4312 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4313 Op1Info.getNoProps(), Op2Info.getNoProps());
4314 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4315 // have similar cost.
4316 auto VT = TLI->getValueType(DL, Ty);
4317 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4318 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4319 // Neg can be folded into the asr instruction.
4320 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4321 : (3 * AsrCost + AddCost);
4322 } else {
4323 return MulCost + AsrCost + 2 * AddCost;
4324 }
4325 } else if (VT.isVector()) {
4326 InstructionCost UsraCost = 2 * AsrCost;
4327 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4328 // Division with scalable types corresponds to native 'asrd'
4329 // instruction when SVE is available.
4330 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4331
4332 // One more for the negation in SDIV
4333 InstructionCost Cost =
4334 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4335 if (Ty->isScalableTy() && ST->hasSVE())
4336 Cost += 2 * AsrCost;
4337 else {
4338 Cost +=
4339 UsraCost +
4340 (ISD == ISD::SDIV
4341 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4342 : 2 * AddCost);
4343 }
4344 return Cost;
4345 } else if (LT.second == MVT::v2i64) {
4346 return VT.getVectorNumElements() *
4347 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4348 Op1Info.getNoProps(),
4349 Op2Info.getNoProps());
4350 } else {
4351 // When SVE is available, we get:
4352 // smulh + lsr + add/sub + asr + add/sub.
4353 if (Ty->isScalableTy() && ST->hasSVE())
4354 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4355 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4356 }
4357 }
4358 }
4359 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4360 LT.second.isFixedLengthVector()) {
4361 // FIXME: When the constant vector is non-uniform, this may result in
4362 // loading the vector from constant pool or in some cases, may also result
4363 // in scalarization. For now, we are approximating this with the
4364 // scalarization cost.
4365 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4366 CostKind, -1, nullptr, nullptr);
4367 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4368 CostKind, -1, nullptr, nullptr);
4369 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4370 return ExtractCost + InsertCost +
4371 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4372 CostKind, Op1Info.getNoProps(),
4373 Op2Info.getNoProps());
4374 }
4375 [[fallthrough]];
4376 case ISD::UDIV:
4377 case ISD::UREM: {
4378 auto VT = TLI->getValueType(DL, Ty);
4379 if (Op2Info.isConstant()) {
4380 // If the operand is a power of 2 we can use the shift or and cost.
4381 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4382 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4383 Op1Info.getNoProps(),
4384 Op2Info.getNoProps());
4385 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4386 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4387 Op1Info.getNoProps(),
4388 Op2Info.getNoProps());
4389
4390 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4391 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4392 // The MULHU will be expanded to UMULL for the types not listed below,
4393 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4394 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4395 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4396 LT.second == MVT::nxv16i8;
4397 bool Is128bit = LT.second.is128BitVector();
4398
4399 InstructionCost MulCost =
4400 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4401 Op1Info.getNoProps(), Op2Info.getNoProps());
4402 InstructionCost AddCost =
4403 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4404 Op1Info.getNoProps(), Op2Info.getNoProps());
4405 InstructionCost ShrCost =
4406 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4407 Op1Info.getNoProps(), Op2Info.getNoProps());
4408 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4409 (HasMULH ? 0 : ShrCost) + // UMULL shift
4410 AddCost * 2 + ShrCost;
4411 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4412 }
4413 }
4414
4415 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4416 // emitted by the backend even when those functions are not declared in the
4417 // module.
4418 if (!VT.isVector() && VT.getSizeInBits() > 64)
4419 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4420
4421 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4422 Opcode, Ty, CostKind, Op1Info, Op2Info);
4423 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4424 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4425 // SDIV/UDIV operations are lowered using SVE, then we can have less
4426 // costs.
4427 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4428 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4429 static const CostTblEntry DivTbl[]{
4430 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4431 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4432 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4433 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4434 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4435 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4436
4437 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4438 if (nullptr != Entry)
4439 return Entry->Cost;
4440 }
4441 // For 8/16-bit elements, the cost is higher because the type
4442 // requires promotion and possibly splitting:
4443 if (LT.second.getScalarType() == MVT::i8)
4444 Cost *= 8;
4445 else if (LT.second.getScalarType() == MVT::i16)
4446 Cost *= 4;
4447 return Cost;
4448 } else {
4449 // If one of the operands is a uniform constant then the cost for each
4450 // element is Cost for insertion, extraction and division.
4451 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4452 // operation with scalar type
4453 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4454 (Op2Info.isConstant() && Op2Info.isUniform())) {
4455 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
4456 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4457 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4458 return (4 + DivCost) * VTy->getNumElements();
4459 }
4460 }
4461 // On AArch64, without SVE, vector divisions are expanded
4462 // into scalar divisions of each pair of elements.
4463 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4464 -1, nullptr, nullptr);
4465 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4466 nullptr, nullptr);
4467 }
4468
4469 // TODO: if one of the arguments is scalar, then it's not necessary to
4470 // double the cost of handling the vector elements.
4471 Cost += Cost;
4472 }
4473 return Cost;
4474 }
4475 case ISD::MUL:
4476 // When SVE is available, then we can lower the v2i64 operation using
4477 // the SVE mul instruction, which has a lower cost.
4478 if (LT.second == MVT::v2i64 && ST->hasSVE())
4479 return LT.first;
4480
4481 // When SVE is not available, there is no MUL.2d instruction,
4482 // which means mul <2 x i64> is expensive as elements are extracted
4483 // from the vectors and the muls scalarized.
4484 // As getScalarizationOverhead is a bit too pessimistic, we
4485 // estimate the cost for a i64 vector directly here, which is:
4486 // - four 2-cost i64 extracts,
4487 // - two 2-cost i64 inserts, and
4488 // - two 1-cost muls.
4489 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4490 // LT.first = 2 the cost is 28.
4491 if (LT.second != MVT::v2i64)
4492 return LT.first;
4493 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4494 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4495 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4496 nullptr, nullptr) *
4497 2 +
4498 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4499 nullptr, nullptr));
4500 case ISD::ADD:
4501 case ISD::XOR:
4502 case ISD::OR:
4503 case ISD::AND:
4504 case ISD::SRL:
4505 case ISD::SRA:
4506 case ISD::SHL:
4507 // These nodes are marked as 'custom' for combining purposes only.
4508 // We know that they are legal. See LowerAdd in ISelLowering.
4509 return LT.first;
4510
4511 case ISD::FNEG:
4512 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4513 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4514 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4515 CxtI &&
4516 ((CxtI->hasOneUse() &&
4517 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4518 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4519 return 0;
4520 [[fallthrough]];
4521 case ISD::FADD:
4522 case ISD::FSUB:
4523 if (!Ty->getScalarType()->isFP128Ty())
4524 return LT.first;
4525 [[fallthrough]];
4526 case ISD::FMUL:
4527 case ISD::FDIV:
4528 // These nodes are marked as 'custom' just to lower them to SVE.
4529 // We know said lowering will incur no additional cost.
4530 if (!Ty->getScalarType()->isFP128Ty())
4531 return 2 * LT.first;
4532
4533 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4534 Op2Info);
4535 case ISD::FREM:
4536 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4537 // those functions are not declared in the module.
4538 if (!Ty->isVectorTy())
4539 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4540 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4541 Op2Info);
4542 }
4543}
4544
4545InstructionCost
4546AArch64TTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
4547 const SCEV *Ptr,
4548 TTI::TargetCostKind CostKind) const {
4549 // Address computations in vectorized code with non-consecutive addresses will
4550 // likely result in more instructions compared to scalar code where the
4551 // computation can more often be merged into the index mode. The resulting
4552 // extra micro-ops can significantly decrease throughput.
4553 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4554 int MaxMergeDistance = 64;
4555
4556 if (PtrTy->isVectorTy() && SE &&
4557 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4558 return NumVectorInstToHideOverhead;
4559
4560 // In many cases the address computation is not merged into the instruction
4561 // addressing mode.
4562 return 1;
4563}
4564
4565/// Check whether Opcode1 has less throughput according to the scheduling
4566/// model than Opcode2.
4567bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4568 unsigned Opcode1, unsigned Opcode2) const {
4569 const MCSchedModel &Sched = ST->getSchedModel();
4570 const TargetInstrInfo *TII = ST->getInstrInfo();
4571 if (!Sched.hasInstrSchedModel())
4572 return false;
4573
4574 const MCSchedClassDesc *SCD1 =
4575 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4576 const MCSchedClassDesc *SCD2 =
4577 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4578 // We cannot handle variant scheduling classes without an MI. If we need to
4579 // support them for any of the instructions whose information we query, we
4580 // might need to add a way to resolve them without an MI, or stop using the
4581 // scheduling info.
4582 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4583 "Cannot handle variant scheduling classes without an MI");
4584 if (!SCD1->isValid() || !SCD2->isValid())
4585 return false;
4586
4587 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4588 MCSchedModel::getReciprocalThroughput(*ST, *SCD2);
4589}
4590
4591InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4592 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4593 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4594 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4595 // Vector selects that are wider than the register width are not lowered
4596 // well. TODO: Improve this with different cost kinds.
4597 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4598 // We would need this many instructions to hide the scalarization happening.
4599 const int AmortizationCost = 20;
4600
4601 // If VecPred is not set, check if we can get a predicate from the context
4602 // instruction, if its type matches the requested ValTy.
4603 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4604 CmpPredicate CurrentPred;
4605 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4606 m_Value())))
4607 VecPred = CurrentPred;
4608 }
4609 // Check if we have a compare/select chain that can be lowered using
4610 // a (F)CMxx & BFI pair.
4611 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4612 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4613 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4614 VecPred == CmpInst::FCMP_UNE) {
4615 static const auto ValidMinMaxTys = {
4616 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4617 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4618 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4619
4620 auto LT = getTypeLegalizationCost(ValTy);
4621 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
4622 (ST->hasFullFP16() &&
4623 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
4624 return LT.first;
4625 }
4626
4627 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4628 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4629 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4630 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4631 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4632 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4633 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4634 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4635 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4636 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4637 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4638 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4639
4640 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4641 EVT SelValTy = TLI->getValueType(DL, ValTy);
4642 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4643 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4644 SelCondTy.getSimpleVT(),
4645 SelValTy.getSimpleVT()))
4646 return Entry->Cost;
4647 }
4648 }
4649
4650 if (Opcode == Instruction::FCmp) {
4651 if (auto PromotedCost = getFP16BF16PromoteCost(
4652 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4653 // TODO: Consider costing SVE FCMPs.
4654 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4655 InstructionCost Cost =
4656 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4657 CostKind, Op1Info, Op2Info);
4658 if (isa<VectorType>(PromotedTy))
4660 Instruction::Trunc,
4664 return Cost;
4665 }))
4666 return *PromotedCost;
4667
4668 auto LT = getTypeLegalizationCost(ValTy);
4669 // Model unknown fp compares as a libcall.
4670 if (LT.second.getScalarType() != MVT::f64 &&
4671 LT.second.getScalarType() != MVT::f32 &&
4672 LT.second.getScalarType() != MVT::f16)
4673 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4674 {ValTy, ValTy}, CostKind);
4675
4676 // Some comparison operators require expanding to multiple compares + or.
4677 unsigned Factor = 1;
4678 if (!CondTy->isVectorTy() &&
4679 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4680 Factor = 2; // fcmp with 2 selects
4681 else if (isa<FixedVectorType>(ValTy) &&
4682 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4683 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4684 Factor = 3; // fcmxx+fcmyy+or
4685 else if (isa<ScalableVectorType>(ValTy) &&
4686 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4687 Factor = 3; // fcmxx+fcmyy+or
4688
4689 if (isa<ScalableVectorType>(ValTy) &&
4691 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4692 AArch64::FCMEQv4f32))
4693 Factor *= 2;
4694
4695 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4696 }
4697
4698 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4699 // icmp(and, 0) as free, as we can make use of ands, but only if the
4700 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4701 // providing it will not cause performance regressions.
4702 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4703 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4704 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4705 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4706 if (match(I->getOperand(1), m_Zero()))
4707 return 0;
4708
4709 // x >= 1 / x < 1 -> x > 0 / x <= 0
4710 if (match(I->getOperand(1), m_One()) &&
4711 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4712 return 0;
4713
4714 // x <= -1 / x > -1 -> x > 0 / x <= 0
4715 if (match(I->getOperand(1), m_AllOnes()) &&
4716 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4717 return 0;
4718 }
4719
4720 // The base case handles scalable vectors fine for now, since it treats the
4721 // cost as 1 * legalization cost.
4722 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4723 Op1Info, Op2Info, I);
4724}
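// Illustrative example (editorial note, not part of the original source): the
// icmp(and, 0) handling near the end of getCmpSelInstrCost above treats IR
// such as
//   %a = and i64 %x, %y
//   %c = icmp eq i64 %a, 0
// as free at the throughput cost kind, because the AND and the compare can be
// folded into a single flag-setting ANDS/TST instruction.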
4725
4726TTI::MemCmpExpansionOptions
4727AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4728 TTI::MemCmpExpansionOptions Options;
4729 if (ST->requiresStrictAlign()) {
4730 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4731 // a bunch of instructions when strict align is enabled.
4732 return Options;
4733 }
4734 Options.AllowOverlappingLoads = true;
4735 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4736 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4737 // TODO: Though vector loads usually perform well on AArch64, on some targets
4738 // they may wake up the FP unit, which raises the power consumption. Perhaps
4739 // they could be used with no holds barred (-O3).
4740 Options.LoadSizes = {8, 4, 2, 1};
4741 Options.AllowedTailExpansions = {3, 5, 6};
4742 return Options;
4743}
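// Worked example (editorial note, not part of the original source): with
// LoadSizes = {8, 4, 2, 1} and AllowOverlappingLoads set above, a 7-byte
// memcmp can be expanded as two overlapping 4-byte loads per buffer (at
// offsets 0 and 3) rather than a 4+2+1 load chain, assuming the generic
// MemCmpExpansion logic picks the overlapping form.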
4744
4745bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4746 return ST->hasSVE();
4747}
4748
4752 Type *Src = MICA.getDataType();
4753
4754 if (useNeonVector(Src))
4755 return BaseT::getMaskedMemoryOpCost(MICA, CostKind);
4756 auto LT = getTypeLegalizationCost(Src);
4757 if (!LT.first.isValid())
4758 return InstructionCost::getInvalid();
4759
4760 // Return an invalid cost for element types that we are unable to lower.
4761 auto *VT = cast<VectorType>(Src);
4762 if (VT->getElementType()->isIntegerTy(1))
4763 return InstructionCost::getInvalid();
4764
4765 // The code-generator is currently not able to handle scalable vectors
4766 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4767 // it. This change will be removed when code-generation for these types is
4768 // sufficiently reliable.
4769 if (VT->getElementCount() == ElementCount::getScalable(1))
4770 return InstructionCost::getInvalid();
4771
4772 return LT.first;
4773}
4774
4775// This function returns the gather/scatter overhead, either from the
4776// user-provided value or from the per-target specialized value in \p ST.
4777static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4778 const AArch64Subtarget *ST) {
4779 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4780 "Should be called on only load or stores.");
4781 switch (Opcode) {
4782 case Instruction::Load:
4783 if (SVEGatherOverhead.getNumOccurrences() > 0)
4784 return SVEGatherOverhead;
4785 return ST->getGatherOverhead();
4786 break;
4787 case Instruction::Store:
4788 if (SVEScatterOverhead.getNumOccurrences() > 0)
4789 return SVEScatterOverhead;
4790 return ST->getScatterOverhead();
4791 break;
4792 default:
4793 llvm_unreachable("Shouldn't have reached here");
4794 }
4795}
4796
4797InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4798 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4799 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4800 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
4801 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4802 Alignment, CostKind, I);
4803 auto *VT = cast<VectorType>(DataTy);
4804 auto LT = getTypeLegalizationCost(DataTy);
4805 if (!LT.first.isValid())
4806 return InstructionCost::getInvalid();
4807
4808 // Return an invalid cost for element types that we are unable to lower.
4809 if (!LT.second.isVector() ||
4810 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
4811 VT->getElementType()->isIntegerTy(1))
4812 return InstructionCost::getInvalid();
4813
4814 // The code-generator is currently not able to handle scalable vectors
4815 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4816 // it. This change will be removed when code-generation for these types is
4817 // sufficiently reliable.
4818 if (VT->getElementCount() == ElementCount::getScalable(1))
4819 return InstructionCost::getInvalid();
4820
4821 ElementCount LegalVF = LT.second.getVectorElementCount();
4822 InstructionCost MemOpCost =
4823 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
4824 {TTI::OK_AnyValue, TTI::OP_None}, I);
4825 // Add on an overhead cost for using gathers/scatters.
4826 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4827 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
4828}
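// Worked example (editorial note, not part of the original source): for a
// legal <vscale x 4 x i32> gather load, the cost above is roughly
//   LT.first * (scalar load cost * sve-gather-overhead) * max elements per
//   vector,
// so with the default overhead of 10 and a scalar load cost of 1 this models
// the gather as an order of magnitude more expensive than a contiguous load.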
4829
4830bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4831 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
4832}
4833
4834InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4835 Align Alignment,
4836 unsigned AddressSpace,
4837 TTI::TargetCostKind CostKind,
4838 TTI::OperandValueInfo OpInfo,
4839 const Instruction *I) const {
4840 EVT VT = TLI->getValueType(DL, Ty, true);
4841 // Type legalization can't handle structs
4842 if (VT == MVT::Other)
4843 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
4844 CostKind);
4845
4846 auto LT = getTypeLegalizationCost(Ty);
4847 if (!LT.first.isValid())
4848 return InstructionCost::getInvalid();
4849
4850 // The code-generator is currently not able to handle scalable vectors
4851 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4852 // it. This change will be removed when code-generation for these types is
4853 // sufficiently reliable.
4854 // We also only support full register predicate loads and stores.
4855 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4856 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
4857 (VTy->getElementType()->isIntegerTy(1) &&
4858 !VTy->getElementCount().isKnownMultipleOf(
4859 ElementCount::getScalable(16))))
4860 return InstructionCost::getInvalid();
4861
4862 // TODO: consider latency as well for TCK_SizeAndLatency.
4864 return LT.first;
4865
4867 return 1;
4868
4869 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4870 LT.second.is128BitVector() && Alignment < Align(16)) {
4871 // Unaligned stores are extremely inefficient. We don't split all
4872 // unaligned 128-bit stores because of the negative impact that has been
4873 // shown in practice on inlined block copy code.
4874 // We make such stores expensive so that we will only vectorize if there
4875 // are 6 other instructions getting vectorized.
4876 const int AmortizationCost = 6;
4877
4878 return LT.first * 2 * AmortizationCost;
4879 }
4880
4881 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4882 if (Ty->isPtrOrPtrVectorTy())
4883 return LT.first;
4884
4885 if (useNeonVector(Ty)) {
4886 // Check truncating stores and extending loads.
4887 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4888 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4889 if (VT == MVT::v4i8)
4890 return 2;
4891 // Otherwise we need to scalarize.
4892 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
4893 }
4894 EVT EltVT = VT.getVectorElementType();
4895 unsigned EltSize = EltVT.getScalarSizeInBits();
4896 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
4897 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4898 return LT.first;
4899 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4900 // widening to v4i8, which produces suboptimal results.
4901 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4902 return LT.first;
4903
4904 // Check non-power-of-2 loads/stores for legal vector element types with
4905 // NEON. Non-power-of-2 memory ops will get broken down into a set of
4906 // smaller power-of-2 operations, including ld1/st1.
4907 LLVMContext &C = Ty->getContext();
4909 SmallVector<EVT> TypeWorklist;
4910 TypeWorklist.push_back(VT);
4911 while (!TypeWorklist.empty()) {
4912 EVT CurrVT = TypeWorklist.pop_back_val();
4913 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4914 if (isPowerOf2_32(CurrNumElements)) {
4915 Cost += 1;
4916 continue;
4917 }
4918
4919 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
4920 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
4921 TypeWorklist.push_back(
4922 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
4923 }
4924 return Cost;
4925 }
4926
4927 return LT.first;
4928}
4929
4931 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4932 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4933 bool UseMaskForCond, bool UseMaskForGaps) const {
4934 assert(Factor >= 2 && "Invalid interleave factor");
4935 auto *VecVTy = cast<VectorType>(VecTy);
4936
4937 if (VecTy->isScalableTy() && !ST->hasSVE())
4938 return InstructionCost::getInvalid();
4939
4940 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4941 // only have lowering for power-of-2 factors.
4942 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4943 // InterleavedAccessPass for ld3/st3
4944 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
4945 return InstructionCost::getInvalid();
4946
4947 // Vectorization for masked interleaved accesses is only enabled for scalable
4948 // VF.
4949 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4950 return InstructionCost::getInvalid();
4951
4952 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4953 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4954 auto *SubVecTy =
4955 VectorType::get(VecVTy->getElementType(),
4956 VecVTy->getElementCount().divideCoefficientBy(Factor));
4957
4958 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4959 // Accesses having vector types that are a multiple of 128 bits can be
4960 // matched to more than one ldN/stN instruction.
4961 bool UseScalable;
4962 if (MinElts % Factor == 0 &&
4963 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4964 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4965 }
4966
4967 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4968 Alignment, AddressSpace, CostKind,
4969 UseMaskForCond, UseMaskForGaps);
4970}
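// Worked example (editorial note, not part of the original source): an ld3
// from a <12 x i32> interleaved access (Factor = 3, SubVecTy = <4 x i32>, a
// single legal 128-bit access) takes the early return above and is costed as
//   Factor * getNumInterleavedAccesses(SubVecTy) = 3 * 1 = 3.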
4971
4972InstructionCost
4973AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4974 InstructionCost Cost = 0;
4975 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4976 for (auto *I : Tys) {
4977 if (!I->isVectorTy())
4978 continue;
4979 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4980 128)
4981 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4982 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4983 }
4984 return Cost;
4985}
4986
4987unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
4988 return ST->getMaxInterleaveFactor();
4989}
4990
4991// For Falkor, we want to avoid having too many strided loads in a loop since
4992// that can exhaust the HW prefetcher resources. We adjust the unroller
4993// MaxCount preference below to attempt to ensure unrolling doesn't create too
4994// many strided loads.
4995static void
4996getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4997 TargetTransformInfo::UnrollingPreferences &UP) {
4998 enum { MaxStridedLoads = 7 };
4999 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5000 int StridedLoads = 0;
5001 // FIXME? We could make this more precise by looking at the CFG and
5002 // e.g. not counting loads in each side of an if-then-else diamond.
5003 for (const auto BB : L->blocks()) {
5004 for (auto &I : *BB) {
5005 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5006 if (!LMemI)
5007 continue;
5008
5009 Value *PtrValue = LMemI->getPointerOperand();
5010 if (L->isLoopInvariant(PtrValue))
5011 continue;
5012
5013 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5014 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5015 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5016 continue;
5017
5018 // FIXME? We could take pairing of unrolled load copies into account
5019 // by looking at the AddRec, but we would probably have to limit this
5020 // to loops with no stores or other memory optimization barriers.
5021 ++StridedLoads;
5022 // We've seen enough strided loads that seeing more won't make a
5023 // difference.
5024 if (StridedLoads > MaxStridedLoads / 2)
5025 return StridedLoads;
5026 }
5027 }
5028 return StridedLoads;
5029 };
5030
5031 int StridedLoads = countStridedLoads(L, SE);
5032 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5033 << " strided loads\n");
5034 // Pick the largest power of 2 unroll count that won't result in too many
5035 // strided loads.
5036 if (StridedLoads) {
5037 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5038 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5039 << UP.MaxCount << '\n');
5040 }
5041}
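// Worked example (editorial note, not part of the original source): if the
// loop body contains 3 strided loads, the code above sets
//   UP.MaxCount = 1 << Log2_32(7 / 3) = 1 << Log2_32(2) = 2,
// so unrolling by more than 2 would exceed the MaxStridedLoads budget of 7.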
5042
5043// This function returns true if the loop:
5044// 1. Has a valid cost, and
5045// 2. Has a cost within the supplied budget.
5046// Otherwise it returns false.
5047static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
5048 InstructionCost Budget,
5049 unsigned *FinalSize) {
5050 // Estimate the size of the loop.
5051 InstructionCost LoopCost = 0;
5052
5053 for (auto *BB : L->getBlocks()) {
5054 for (auto &I : *BB) {
5055 SmallVector<const Value *, 4> Operands(I.operand_values());
5056 InstructionCost Cost =
5057 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5058 // This can happen with intrinsics that don't currently have a cost model
5059 // or for some operations that require SVE.
5060 if (!Cost.isValid())
5061 return false;
5062
5063 LoopCost += Cost;
5064 if (LoopCost > Budget)
5065 return false;
5066 }
5067 }
5068
5069 if (FinalSize)
5070 *FinalSize = LoopCost.getValue();
5071 return true;
5072}
5073
5074static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5075 const AArch64TTIImpl &TTI) {
5076 // Only consider loops with unknown trip counts for which we can determine
5077 // a symbolic expression. Multi-exit loops with small known trip counts will
5078 // likely be unrolled anyway.
5079 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5080 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
5081 return false;
5082
5083 // It might not be worth unrolling loops with low max trip counts. Restrict
5084 // this to max trip counts > 32 for now.
5085 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5086 if (MaxTC > 0 && MaxTC <= 32)
5087 return false;
5088
5089 // Make sure the loop size is <= 5.
5090 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5091 return false;
5092
5093 // Small search loops with multiple exits can be highly beneficial to unroll.
5094 // We only care about loops with exactly two exiting blocks, although each
5095 // block could jump to the same exit block.
5096 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5097 if (Blocks.size() != 2)
5098 return false;
5099
5100 if (any_of(Blocks, [](BasicBlock *BB) {
5101 return !isa<BranchInst>(BB->getTerminator());
5102 }))
5103 return false;
5104
5105 return true;
5106}
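// Illustrative example (editorial note, not part of the original source): a
// std::find-style search loop satisfies the checks above, e.g.
//   for (i = 0; i != n; ++i)
//     if (p[i] == key)
//       break;
// which has two exiting blocks (the element compare and the counter check),
// both ending in conditional branches.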
5107
5108/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5109/// OOO engine's wide instruction window and various predictors.
5110static void
5111getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5112 TargetTransformInfo::UnrollingPreferences &UP,
5113 const AArch64TTIImpl &TTI) {
5114 // Limit loops with structure that is highly likely to benefit from runtime
5115 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5116 // likely with complex control flow). Note that the heuristics here may be
5117 // overly conservative and we err on the side of avoiding runtime unrolling
5118 // rather than unrolling excessively. They are all subject to further refinement.
5119 if (!L->isInnermost() || L->getNumBlocks() > 8)
5120 return;
5121
5122 // Loops with multiple exits are handled by common code.
5123 if (!L->getExitBlock())
5124 return;
5125
5126 // Check if the loop contains any reductions that could be parallelized when
5127 // unrolling. If so, enable partial unrolling if the trip count is known to be
5128 // a multiple of 2.
5129 bool HasParellelizableReductions =
5130 L->getNumBlocks() == 1 &&
5131 any_of(L->getHeader()->phis(),
5132 [&SE, L](PHINode &Phi) {
5133 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5134 }) &&
5135 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5136 if (HasParellelizableReductions &&
5137 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5138 UP.Partial = true;
5139 UP.MaxCount = 4;
5140 UP.AddAdditionalAccumulators = true;
5141 }
5142
5143 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5144 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
5145 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5146 SE.getSmallConstantMaxTripCount(L) <= 32))
5147 return;
5148
5149 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5150 return;
5151
5153 return;
5154
5155 // Limit to loops with trip counts that are cheap to expand.
5156 UP.SCEVExpansionBudget = 1;
5157
5158 if (HasParellelizableReductions) {
5159 UP.Runtime = true;
5161 UP.AddAdditionalAccumulators = true;
5162 }
5163
5164 // Try to unroll small loops (few blocks, low size budget) if they have
5165 // load/store dependencies, to expose more parallel memory access streams,
5166 // or if they do little work inside a block (i.e. a load -> X -> store pattern).
5167 BasicBlock *Header = L->getHeader();
5168 BasicBlock *Latch = L->getLoopLatch();
5169 if (Header == Latch) {
5170 // Estimate the size of the loop.
5171 unsigned Size;
5172 unsigned Width = 10;
5173 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5174 return;
5175
5176 // Try to find an unroll count that maximizes the use of the instruction
5177 // window, i.e. trying to fetch as many instructions per cycle as possible.
5178 unsigned MaxInstsPerLine = 16;
5179 unsigned UC = 1;
5180 unsigned BestUC = 1;
5181 unsigned SizeWithBestUC = BestUC * Size;
5182 while (UC <= 8) {
5183 unsigned SizeWithUC = UC * Size;
5184 if (SizeWithUC > 48)
5185 break;
5186 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5187 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5188 BestUC = UC;
5189 SizeWithBestUC = BestUC * Size;
5190 }
5191 UC++;
5192 }
5193
5194 if (BestUC == 1)
5195 return;
5196
5197 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5198 SmallVector<StoreInst *> Stores;
5199 for (auto *BB : L->blocks()) {
5200 for (auto &I : *BB) {
5201 Value *Ptr = getLoadStorePointerOperand(&I);
5202 if (!Ptr)
5203 continue;
5204 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5205 if (SE.isLoopInvariant(PtrSCEV, L))
5206 continue;
5207 if (isa<LoadInst>(&I)) {
5208 LoadedValuesPlus.insert(&I);
5209 // Include in-loop 1st users of loaded values.
5210 for (auto *U : I.users())
5211 if (L->contains(cast<Instruction>(U)))
5212 LoadedValuesPlus.insert(U);
5213 } else
5214 Stores.push_back(cast<StoreInst>(&I));
5215 }
5216 }
5217
5218 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5219 return LoadedValuesPlus.contains(SI->getOperand(0));
5220 }))
5221 return;
5222
5223 UP.Runtime = true;
5224 UP.DefaultUnrollRuntimeCount = BestUC;
5225 return;
5226 }
5227
5228 // Try to runtime-unroll loops with early-continues depending on loop-varying
5229 // loads; this helps with branch-prediction for the early-continues.
5230 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
5232 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
5233 !llvm::is_contained(Preds, Header) ||
5234 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5235 return;
5236
5237 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5238 [&](Instruction *I, unsigned Depth) -> bool {
5239 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5240 return false;
5241
5242 if (isa<LoadInst>(I))
5243 return true;
5244
5245 return any_of(I->operands(), [&](Value *V) {
5246 auto *I = dyn_cast<Instruction>(V);
5247 return I && DependsOnLoopLoad(I, Depth + 1);
5248 });
5249 };
5250 CmpPredicate Pred;
5251 Instruction *I;
5252 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5253 m_Value())) &&
5254 DependsOnLoopLoad(I, 0)) {
5255 UP.Runtime = true;
5256 }
5257}
5258
5259void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5260 TTI::UnrollingPreferences &UP,
5261 OptimizationRemarkEmitter *ORE) const {
5262 // Enable partial unrolling and runtime unrolling.
5263 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5264
5265 UP.UpperBound = true;
5266
5267 // For inner loop, it is more likely to be a hot one, and the runtime check
5268 // can be promoted out from LICM pass, so the overhead is less, let's try
5269 // a larger threshold to unroll more loops.
5270 if (L->getLoopDepth() > 1)
5271 UP.PartialThreshold *= 2;
5272
5273 // Disable partial & runtime unrolling on -Os.
5274 UP.PartialOptSizeThreshold = 0;
5275
5276 // Scan the loop: don't unroll loops with calls as this could prevent
5277 // inlining. Don't unroll auto-vectorized loops either, though do allow
5278 // unrolling of the scalar remainder.
5279 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5280 InstructionCost Cost = 0;
5281 for (auto *BB : L->getBlocks()) {
5282 for (auto &I : *BB) {
5283 // Both auto-vectorized loops and the scalar remainder have the
5284 // isvectorized attribute, so differentiate between them by the presence
5285 // of vector instructions.
5286 if (IsVectorized && I.getType()->isVectorTy())
5287 return;
5288 if (isa<CallBase>(I)) {
5291 if (!isLoweredToCall(F))
5292 continue;
5293 return;
5294 }
5295
5296 SmallVector<const Value *, 4> Operands(I.operand_values());
5297 Cost += getInstructionCost(&I, Operands,
5298 TargetTransformInfo::TCK_SizeAndLatency);
5299 }
5300 }
5301
5302 // Apply subtarget-specific unrolling preferences.
5303 switch (ST->getProcFamily()) {
5304 case AArch64Subtarget::AppleA14:
5305 case AArch64Subtarget::AppleA15:
5306 case AArch64Subtarget::AppleA16:
5307 case AArch64Subtarget::AppleM4:
5308 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5309 break;
5310 case AArch64Subtarget::Falkor:
5311 if (EnableFalkorHWPFUnrollFix)
5312 getFalkorUnrollingPreferences(L, SE, UP);
5313 break;
5314 default:
5315 break;
5316 }
5317
5318 // If this is a small, multi-exit loop similar to something like std::find,
5319 // then there is typically a performance improvement achieved by unrolling.
5320 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5321 UP.RuntimeUnrollMultiExit = true;
5322 UP.Runtime = true;
5323 // Limit unroll count.
5324 UP.DefaultUnrollRuntimeCount = 4;
5325 // Allow slightly more costly trip-count expansion to catch search loops
5326 // with pointer inductions.
5327 UP.SCEVExpansionBudget = 5;
5328 return;
5329 }
5330
5331 // Enable runtime unrolling for in-order models
5332 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
5333 // by checking for that case, we can ensure that the default behaviour is
5334 // unchanged.
5335 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5336 !ST->getSchedModel().isOutOfOrder()) {
5337 UP.Runtime = true;
5338 UP.Partial = true;
5339 UP.UnrollRemainder = true;
5340 UP.DefaultUnrollRuntimeCount = 4;
5341
5342 UP.UnrollAndJam = true;
5343 UP.UnrollAndJamInnerLoopThreshold = 60;
5344 }
5345
5346 // Forced unrolling of small loops can be very useful because of the
5347 // branch-taken cost of the backedge.
5349 UP.Force = true;
5350}
5351
5352void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5353 TTI::PeelingPreferences &PP) const {
5354 BaseT::getPeelingPreferences(L, SE, PP);
5355}
5356
5357Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
5358 Type *ExpectedType,
5359 bool CanCreate) const {
5360 switch (Inst->getIntrinsicID()) {
5361 default:
5362 return nullptr;
5363 case Intrinsic::aarch64_neon_st2:
5364 case Intrinsic::aarch64_neon_st3:
5365 case Intrinsic::aarch64_neon_st4: {
5366 // Create a struct type
5367 StructType *ST = dyn_cast<StructType>(ExpectedType);
5368 if (!CanCreate || !ST)
5369 return nullptr;
5370 unsigned NumElts = Inst->arg_size() - 1;
5371 if (ST->getNumElements() != NumElts)
5372 return nullptr;
5373 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5374 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5375 return nullptr;
5376 }
5377 Value *Res = PoisonValue::get(ExpectedType);
5378 IRBuilder<> Builder(Inst);
5379 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5380 Value *L = Inst->getArgOperand(i);
5381 Res = Builder.CreateInsertValue(Res, L, i);
5382 }
5383 return Res;
5384 }
5385 case Intrinsic::aarch64_neon_ld2:
5386 case Intrinsic::aarch64_neon_ld3:
5387 case Intrinsic::aarch64_neon_ld4:
5388 if (Inst->getType() == ExpectedType)
5389 return Inst;
5390 return nullptr;
5391 }
5392}
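// Illustrative example (editorial note, not part of the original source): for
//   call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %a, <4 x i32> %b,
//                                             <4 x i32> %c, ptr %p)
// with an expected type of { <4 x i32>, <4 x i32>, <4 x i32> }, the st3 case
// above repackages the three stored vectors into that struct with a chain of
// insertvalue instructions.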
5393
5394bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5395 MemIntrinsicInfo &Info) const {
5396 switch (Inst->getIntrinsicID()) {
5397 default:
5398 break;
5399 case Intrinsic::aarch64_neon_ld2:
5400 case Intrinsic::aarch64_neon_ld3:
5401 case Intrinsic::aarch64_neon_ld4:
5402 Info.ReadMem = true;
5403 Info.WriteMem = false;
5404 Info.PtrVal = Inst->getArgOperand(0);
5405 break;
5406 case Intrinsic::aarch64_neon_st2:
5407 case Intrinsic::aarch64_neon_st3:
5408 case Intrinsic::aarch64_neon_st4:
5409 Info.ReadMem = false;
5410 Info.WriteMem = true;
5411 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5412 break;
5413 }
5414
5415 switch (Inst->getIntrinsicID()) {
5416 default:
5417 return false;
5418 case Intrinsic::aarch64_neon_ld2:
5419 case Intrinsic::aarch64_neon_st2:
5420 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5421 break;
5422 case Intrinsic::aarch64_neon_ld3:
5423 case Intrinsic::aarch64_neon_st3:
5424 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5425 break;
5426 case Intrinsic::aarch64_neon_ld4:
5427 case Intrinsic::aarch64_neon_st4:
5428 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5429 break;
5430 }
5431 return true;
5432}
5433
5434/// See if \p I should be considered for address type promotion. We check if \p
5435 /// I is a sext with the right type that is used in memory accesses. If it is
5436 /// used in a "complex" getelementptr, we allow it to be promoted without
5437 /// finding other sext instructions that sign extended the same initial value.
5438 /// A getelementptr is considered as "complex" if it has more than 2 operands.
5440 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5441 bool Considerable = false;
5442 AllowPromotionWithoutCommonHeader = false;
5443 if (!isa<SExtInst>(&I))
5444 return false;
5445 Type *ConsideredSExtType =
5446 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5447 if (I.getType() != ConsideredSExtType)
5448 return false;
5449 // See if the sext is the one with the right type and used in at least one
5450 // GetElementPtrInst.
5451 for (const User *U : I.users()) {
5452 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5453 Considerable = true;
5454 // A getelementptr is considered as "complex" if it has more than 2
5455 // operands. We will promote a SExt used in such complex GEP as we
5456 // expect some computation to be merged if they are done on 64 bits.
5457 if (GEPInst->getNumOperands() > 2) {
5458 AllowPromotionWithoutCommonHeader = true;
5459 break;
5460 }
5461 }
5462 }
5463 return Considerable;
5464}
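// Illustrative IR (editorial note, not part of the original source): the sext
// below feeds a three-operand ("complex") GEP, so the function above returns
// true and allows promotion without a common header:
//   %idx = sext i32 %i to i64
//   %p = getelementptr [16 x i32], ptr %base, i64 0, i64 %idx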
5465
5466bool AArch64TTIImpl::isLegalToVectorizeReduction(
5467 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5468 if (!VF.isScalable())
5469 return true;
5470
5471 Type *Ty = RdxDesc.getRecurrenceType();
5472 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5473 return false;
5474
5475 switch (RdxDesc.getRecurrenceKind()) {
5476 case RecurKind::Sub:
5478 case RecurKind::Add:
5479 case RecurKind::FAdd:
5480 case RecurKind::And:
5481 case RecurKind::Or:
5482 case RecurKind::Xor:
5483 case RecurKind::SMin:
5484 case RecurKind::SMax:
5485 case RecurKind::UMin:
5486 case RecurKind::UMax:
5487 case RecurKind::FMin:
5488 case RecurKind::FMax:
5489 case RecurKind::FMulAdd:
5490 case RecurKind::AnyOf:
5491 return true;
5492 default:
5493 return false;
5494 }
5495}
5496
5498 FastMathFlags FMF,
5499 TTI::TargetCostKind CostKind) const {
5501 // The code-generator is currently not able to handle scalable vectors
5502 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5503 // it. This change will be removed when code-generation for these types is
5504 // sufficiently reliable.
5505 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5506 if (VTy->getElementCount() == ElementCount::getScalable(1))
5507 return InstructionCost::getInvalid();
5508
5509 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5510
5511 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5512 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5513
5514 InstructionCost LegalizationCost = 0;
5515 if (LT.first > 1) {
5516 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5517 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5518 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5519 }
5520
5521 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5522}
5523
5524InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5525 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5526 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5527 InstructionCost LegalizationCost = 0;
5528 if (LT.first > 1) {
5529 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5530 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5531 LegalizationCost *= LT.first - 1;
5532 }
5533
5534 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5535 assert(ISD && "Invalid opcode");
5536 // Add the final reduction cost for the legal horizontal reduction
5537 switch (ISD) {
5538 case ISD::ADD:
5539 case ISD::AND:
5540 case ISD::OR:
5541 case ISD::XOR:
5542 case ISD::FADD:
5543 return LegalizationCost + 2;
5544 default:
5545 return InstructionCost::getInvalid();
5546 }
5547}
5548
5549InstructionCost
5550AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5551 std::optional<FastMathFlags> FMF,
5552 TTI::TargetCostKind CostKind) const {
5553 // The code-generator is currently not able to handle scalable vectors
5554 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5555 // it. This change will be removed when code-generation for these types is
5556 // sufficiently reliable.
5557 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5558 if (VTy->getElementCount() == ElementCount::getScalable(1))
5559 return InstructionCost::getInvalid();
5560
5561 if (TTI::requiresOrderedReduction(FMF)) {
5562 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5563 InstructionCost BaseCost =
5564 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5565 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5566 // end up vectorizing for more computationally intensive loops.
5567 return BaseCost + FixedVTy->getNumElements();
5568 }
5569
5570 if (Opcode != Instruction::FAdd)
5571 return InstructionCost::getInvalid();
5572
5573 auto *VTy = cast<ScalableVectorType>(ValTy);
5574 InstructionCost Cost =
5575 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5576 Cost *= getMaxNumElements(VTy->getElementCount());
5577 return Cost;
5578 }
5579
5580 if (isa<ScalableVectorType>(ValTy))
5581 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5582
5583 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5584 MVT MTy = LT.second;
5585 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5586 assert(ISD && "Invalid opcode");
5587
5588 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5589 // instructions as twice a normal vector add, plus 1 for each legalization
5590 // step (LT.first). This is the only arithmetic vector reduction operation for
5591 // which we have an instruction.
5592 // OR, XOR and AND costs should match the codegen from:
5593 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5594 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5595 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5596 static const CostTblEntry CostTblNoPairwise[]{
5597 {ISD::ADD, MVT::v8i8, 2},
5598 {ISD::ADD, MVT::v16i8, 2},
5599 {ISD::ADD, MVT::v4i16, 2},
5600 {ISD::ADD, MVT::v8i16, 2},
5601 {ISD::ADD, MVT::v2i32, 2},
5602 {ISD::ADD, MVT::v4i32, 2},
5603 {ISD::ADD, MVT::v2i64, 2},
5604 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
5605 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
5606 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
5607 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
5608 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
5609 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
5610 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
5611 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
5612 {ISD::XOR, MVT::v16i8, 7},
5613 {ISD::XOR, MVT::v4i16, 4},
5614 {ISD::XOR, MVT::v8i16, 6},
5615 {ISD::XOR, MVT::v2i32, 3},
5616 {ISD::XOR, MVT::v4i32, 5},
5617 {ISD::XOR, MVT::v2i64, 3},
5618 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
5619 {ISD::AND, MVT::v16i8, 7},
5620 {ISD::AND, MVT::v4i16, 4},
5621 {ISD::AND, MVT::v8i16, 6},
5622 {ISD::AND, MVT::v2i32, 3},
5623 {ISD::AND, MVT::v4i32, 5},
5624 {ISD::AND, MVT::v2i64, 3},
5625 };
5626 switch (ISD) {
5627 default:
5628 break;
5629 case ISD::FADD:
5630 if (Type *EltTy = ValTy->getScalarType();
5631 // FIXME: For half types without fullfp16 support, this could extend and
5632 // use a fp32 faddp reduction but current codegen unrolls.
5633 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5634 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5635 const unsigned NElts = MTy.getVectorNumElements();
5636 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5637 isPowerOf2_32(NElts))
5638 // Reduction corresponding to series of fadd instructions is lowered to
5639 // series of faddp instructions. faddp has latency/throughput that
5640 // matches fadd instruction and hence, every faddp instruction can be
5641 // considered to have a relative cost = 1 with
5642 // CostKind = TCK_RecipThroughput.
5643 // An faddp will pairwise add vector elements, so the size of input
5644 // vector reduces by half every time, requiring
5645 // #(faddp instructions) = log2_32(NElts).
5646 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
5647 }
5648 break;
5649 case ISD::ADD:
5650 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
5651 return (LT.first - 1) + Entry->Cost;
5652 break;
5653 case ISD::XOR:
5654 case ISD::AND:
5655 case ISD::OR:
5656 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
5657 if (!Entry)
5658 break;
5659 auto *ValVTy = cast<FixedVectorType>(ValTy);
5660 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5661 isPowerOf2_32(ValVTy->getNumElements())) {
5662 InstructionCost ExtraCost = 0;
5663 if (LT.first != 1) {
5664 // Type needs to be split, so there is an extra cost of LT.first - 1
5665 // arithmetic ops.
5666 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
5667 MTy.getVectorNumElements());
5668 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5669 ExtraCost *= LT.first - 1;
5670 }
5671 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5672 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
5673 return Cost + ExtraCost;
5674 }
5675 break;
5676 }
5677 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5678}
5679
5681 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5682 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5683 EVT VecVT = TLI->getValueType(DL, VecTy);
5684 EVT ResVT = TLI->getValueType(DL, ResTy);
5685
5686 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5687 VecVT.getSizeInBits() >= 64) {
5688 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5689
5690 // The legal cases are:
5691 // UADDLV 8/16/32->32
5692 // UADDLP 32->64
5693 unsigned RevVTSize = ResVT.getSizeInBits();
5694 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5695 RevVTSize <= 32) ||
5696 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5697 RevVTSize <= 32) ||
5698 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5699 RevVTSize <= 64))
5700 return (LT.first - 1) * 2 + 2;
5701 }
5702
5703 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
5704 CostKind);
5705}
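// Illustrative IR (editorial note, not part of the original source): an
// extending add reduction matching the UADDLV 8->32 case above:
//   %ext = zext <16 x i8> %v to <16 x i32>
//   %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ext)
// With LT.first == 1 for v16i8 this is costed as (1 - 1) * 2 + 2 = 2.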
5706
5707InstructionCost
5708AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5709 Type *ResTy, VectorType *VecTy,
5710 TTI::TargetCostKind CostKind) const {
5711 EVT VecVT = TLI->getValueType(DL, VecTy);
5712 EVT ResVT = TLI->getValueType(DL, ResTy);
5713
5714 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5715 RedOpcode == Instruction::Add) {
5716 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
5717
5718 // The legal cases with dotprod are
5719 // UDOT 8->32
5720 // Which requires an additional uaddv to sum the i32 values.
5721 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5722 ResVT == MVT::i32)
5723 return LT.first + 2;
5724 }
5725
5726 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
5727 CostKind);
5728}
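// Illustrative IR (editorial note, not part of the original source): a
// multiply-accumulate reduction matching the UDOT 8->32 case above:
//   %xe = zext <16 x i8> %x to <16 x i32>
//   %ye = zext <16 x i8> %y to <16 x i32>
//   %m = mul <16 x i32> %xe, %ye
//   %s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
// With LT.first == 1 for v16i8 this is costed as LT.first + 2 = 3 (udot plus
// a final addv-style reduction of the i32 accumulators).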
5729
5730InstructionCost
5731AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5732 TTI::TargetCostKind CostKind) const {
5733 static const CostTblEntry ShuffleTbl[] = {
5734 { TTI::SK_Splice, MVT::nxv16i8, 1 },
5735 { TTI::SK_Splice, MVT::nxv8i16, 1 },
5736 { TTI::SK_Splice, MVT::nxv4i32, 1 },
5737 { TTI::SK_Splice, MVT::nxv2i64, 1 },
5738 { TTI::SK_Splice, MVT::nxv2f16, 1 },
5739 { TTI::SK_Splice, MVT::nxv4f16, 1 },
5740 { TTI::SK_Splice, MVT::nxv8f16, 1 },
5741 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
5742 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
5743 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
5744 { TTI::SK_Splice, MVT::nxv2f32, 1 },
5745 { TTI::SK_Splice, MVT::nxv4f32, 1 },
5746 { TTI::SK_Splice, MVT::nxv2f64, 1 },
5747 };
5748
5749 // The code-generator is currently not able to handle scalable vectors
5750 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5751 // it. This change will be removed when code-generation for these types is
5752 // sufficiently reliable.
5753 if (Tp->getElementCount() == ElementCount::getScalable(1))
5754 return InstructionCost::getInvalid();
5755
5756 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
5757 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
5758 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5759 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
5760 : LT.second;
5761 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
5762 InstructionCost LegalizationCost = 0;
5763 if (Index < 0) {
5764 LegalizationCost =
5765 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
5766 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5767 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
5768 CmpInst::BAD_ICMP_PREDICATE, CostKind);
5769 }
5770
5771 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5772 // Cost performed on a promoted type.
5773 if (LT.second.getScalarType() == MVT::i1) {
5774 LegalizationCost +=
5775 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
5776 TTI::CastContextHint::None, CostKind) +
5777 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
5778 TTI::CastContextHint::None, CostKind);
5779 }
5780 const auto *Entry =
5781 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
5782 assert(Entry && "Illegal Type for Splice");
5783 LegalizationCost += Entry->Cost;
5784 return LegalizationCost * LT.first;
5785}
5786
5787InstructionCost AArch64TTIImpl::getPartialReductionCost(
5788 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5789 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5790 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5791 TTI::TargetCostKind CostKind) const {
5792 InstructionCost Invalid = InstructionCost::getInvalid();
5793
5795 return Invalid;
5796
5797 if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5798 (!ST->isNeonAvailable() || !ST->hasDotProd()))
5799 return Invalid;
5800
5801 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5802 OpAExtend == TTI::PR_None)
5803 return Invalid;
5804
5805 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5806 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
5807 "Unexpected values for OpBExtend or InputTypeB");
5808
5809 // We only support multiply binary operations for now, and for muls we
5810 // require the types being extended to be the same.
5811 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB))
5812 return Invalid;
5813
5814 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5815 if (IsUSDot && !ST->hasMatMulInt8())
5816 return Invalid;
5817
5818 unsigned Ratio =
5819 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5820 if (VF.getKnownMinValue() <= Ratio)
5821 return Invalid;
5822
5823 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
5824 VectorType *AccumVectorType =
5825 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
5826 // We don't yet support all kinds of legalization.
5827 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
5828 EVT::getEVT(AccumVectorType));
5829 switch (TC.first) {
5830 default:
5831 return Invalid;
5835 // The legalised type (e.g. after splitting) must be legal too.
5836 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
5837 TargetLoweringBase::TypeLegal)
5838 return Invalid;
5839 break;
5840 }
5841
5842 std::pair<InstructionCost, MVT> AccumLT =
5843 getTypeLegalizationCost(AccumVectorType);
5844 std::pair<InstructionCost, MVT> InputLT =
5845 getTypeLegalizationCost(InputVectorType);
5846
5847 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5848
5849 // Prefer using full types by costing half-full input types as more expensive.
5850 if (TypeSize::isKnownLT(InputVectorType->getPrimitiveSizeInBits(),
5852 // FIXME: This can be removed after the cost of the extends are folded into
5853 // the dot-product expression in VPlan, after landing:
5854 // https://github.com/llvm/llvm-project/pull/147302
5855 Cost *= 2;
5856
5857 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
5858 // i16 -> i64 is natively supported for udot/sdot
5859 if (AccumLT.second.getScalarType() == MVT::i64 &&
5860 InputLT.second.getScalarType() == MVT::i16)
5861 return Cost;
5862 // i8 -> i64 is supported with an extra level of extends
5863 if (AccumLT.second.getScalarType() == MVT::i64 &&
5864 InputLT.second.getScalarType() == MVT::i8)
5865 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
5866 // because it requires two extra extends on the inputs. But if we'd change
5867 // that now, a regular reduction would be cheaper because the costs of
5868 // the extends in the IR are still counted. This can be fixed
5869 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
5870 return Cost;
5871 }
5872
5873 // i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
5874 if (ST->isSVEorStreamingSVEAvailable() ||
5875 (AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
5876 ST->hasDotProd())) {
5877 if (AccumLT.second.getScalarType() == MVT::i32 &&
5878 InputLT.second.getScalarType() == MVT::i8)
5879 return Cost;
5880 }
5881
5882 // Add additional cost for the extends that would need to be inserted.
5883 return Cost + 2;
5884}
5885
5886InstructionCost
5887AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5888 VectorType *SrcTy, ArrayRef<int> Mask,
5889 TTI::TargetCostKind CostKind, int Index,
5890 VectorType *SubTp, ArrayRef<const Value *> Args,
5891 const Instruction *CxtI) const {
5892 assert((Mask.empty() || DstTy->isScalableTy() ||
5893 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5894 "Expected the Mask to match the return size if given");
5895 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5896 "Expected the same scalar types");
5897 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
5898
5899 // If we have a Mask, and the LT is being legalized somehow, split the Mask
5900 // into smaller vectors and sum the cost of each shuffle.
5901 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
5902 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5903 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5904 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5905 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5906 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5907 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5908 // cost than just the load.
5909 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
5912 return std::max<InstructionCost>(1, LT.first / 4);
5913
5914 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5915 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5916 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5917 // cost than just the store.
5918 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
5920 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
5922 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
5923 return LT.first;
5924
5925 unsigned TpNumElts = Mask.size();
5926 unsigned LTNumElts = LT.second.getVectorNumElements();
5927 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5928 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
5929 LT.second.getVectorElementCount());
5930 InstructionCost Cost;
5931 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5932 PreviousCosts;
5933 for (unsigned N = 0; N < NumVecs; N++) {
5934 SmallVector<int> NMask;
5935 // Split the existing mask into chunks of size LTNumElts. Track the source
5936 // sub-vectors to ensure the result has at most 2 inputs.
5937 unsigned Source1 = -1U, Source2 = -1U;
5938 unsigned NumSources = 0;
5939 for (unsigned E = 0; E < LTNumElts; E++) {
5940 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5941 : PoisonMaskElem;
5942 if (MaskElt < 0) {
5943 NMask.push_back(PoisonMaskElem);
5944 continue;
5945 }
5946
5947 // Calculate which source from the input this comes from and whether it
5948 // is new to us.
5949 unsigned Source = MaskElt / LTNumElts;
5950 if (NumSources == 0) {
5951 Source1 = Source;
5952 NumSources = 1;
5953 } else if (NumSources == 1 && Source != Source1) {
5954 Source2 = Source;
5955 NumSources = 2;
5956 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5957 NumSources++;
5958 }
5959
5960 // Add to the new mask. For the NumSources>2 case these are not correct,
5961 // but are only used for the modular lane number.
5962 if (Source == Source1)
5963 NMask.push_back(MaskElt % LTNumElts);
5964 else if (Source == Source2)
5965 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
5966 else
5967 NMask.push_back(MaskElt % LTNumElts);
5968 }
5969 // Check if we have already generated this sub-shuffle, which means we
5970 // will have already generated the output. For example a <16 x i32> splat
5971 // will be the same sub-splat 4 times, which only needs to be generated
5972 // once and reused.
5973 auto Result =
5974 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
5975 // Check if it was already in the map (already costed).
5976 if (!Result.second)
5977 continue;
5978 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5979 // getShuffleCost. If not then cost it using the worst case as the number
5980 // of element moves into a new vector.
5981 InstructionCost NCost =
5982 NumSources <= 2
5983 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5984 : TTI::SK_PermuteTwoSrc,
5985 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
5986 CxtI)
5987 : LTNumElts;
5988 Result.first->second = NCost;
5989 Cost += NCost;
5990 }
5991 return Cost;
5992 }
5993
5994 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
5995 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5996 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
5997 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
5998 // This currently only handles low or high extracts to prevent SLP vectorizer
5999 // regressions.
6000 // Note that SVE's ext instruction is destructive, but it can be fused with
6001 // a movprfx to act like a constructive instruction.
6002 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6003 if (LT.second.getFixedSizeInBits() >= 128 &&
6004 cast<FixedVectorType>(SubTp)->getNumElements() ==
6005 LT.second.getVectorNumElements() / 2) {
6006 if (Index == 0)
6007 return 0;
6008 if (Index == (int)LT.second.getVectorNumElements() / 2)
6009 return 1;
6010 }
6011 Kind = TTI::SK_PermuteSingleSrc;
6012 }
6013 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6014 // the code to handle length-changing shuffles.
6015 if (Kind == TTI::SK_InsertSubvector) {
6016 LT = getTypeLegalizationCost(DstTy);
6017 SrcTy = DstTy;
6018 }
6019
6020 // Check for identity masks, which we can treat as free for both fixed and
6021 // scalable vector paths.
6022 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6023 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6024 all_of(enumerate(Mask), [](const auto &M) {
6025 return M.value() < 0 || M.value() == (int)M.index();
6026 }))
6027 return 0;
6028
6029 // Segmented shuffle matching.
6030 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6031 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6032 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6034
6036 unsigned Segments =
6038 unsigned SegmentElts = VTy->getNumElements() / Segments;
6039
6040 // dupq zd.t, zn.t[idx]
6041 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6042 ST->isSVEorStreamingSVEAvailable() &&
6043 isDUPQMask(Mask, Segments, SegmentElts))
6044 return LT.first;
6045
6046 // mov zd.q, vn
6047 if (ST->isSVEorStreamingSVEAvailable() &&
6048 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6049 return LT.first;
6050 }
6051
6052 // Check for broadcast loads, which are supported by the LD1R instruction.
6053 // In terms of code-size, the shuffle vector is free when a load + dup get
6054 // folded into a LD1R. That's what we check and return here. For performance
6055 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6056 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6057 // that we model the load + dup sequence slightly higher because LD1R is a
6058 // high latency instruction.
6059 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6060 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6061 if (IsLoad && LT.second.isVector() &&
6062 isLegalBroadcastLoad(SrcTy->getElementType(),
6063 LT.second.getVectorElementCount()))
6064 return 0;
6065 }
6066
6067 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6068 // from the perfect shuffle tables.
6069 if (Mask.size() == 4 &&
6070 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6071 (SrcTy->getScalarSizeInBits() == 16 ||
6072 SrcTy->getScalarSizeInBits() == 32) &&
6073 all_of(Mask, [](int E) { return E < 8; }))
6074 return getPerfectShuffleCost(Mask);
6075
6076 // Check for other shuffles that are not SK_ kinds but we have native
6077 // instructions for, for example ZIP and UZP.
6078 unsigned Unused;
6079 if (LT.second.isFixedLengthVector() &&
6080 LT.second.getVectorNumElements() == Mask.size() &&
6081 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6082 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6083 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6084 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6085 LT.second.getVectorNumElements(), 16) ||
6086 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6087 LT.second.getVectorNumElements(), 32) ||
6088 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6089 LT.second.getVectorNumElements(), 64) ||
6090 // Check for non-zero lane splats
6091 all_of(drop_begin(Mask),
6092 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6093 return 1;
6094
6095 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6096 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6097 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6098 static const CostTblEntry ShuffleTbl[] = {
6099 // Broadcast shuffle kinds can be performed with 'dup'.
6100 {TTI::SK_Broadcast, MVT::v8i8, 1},
6101 {TTI::SK_Broadcast, MVT::v16i8, 1},
6102 {TTI::SK_Broadcast, MVT::v4i16, 1},
6103 {TTI::SK_Broadcast, MVT::v8i16, 1},
6104 {TTI::SK_Broadcast, MVT::v2i32, 1},
6105 {TTI::SK_Broadcast, MVT::v4i32, 1},
6106 {TTI::SK_Broadcast, MVT::v2i64, 1},
6107 {TTI::SK_Broadcast, MVT::v4f16, 1},
6108 {TTI::SK_Broadcast, MVT::v8f16, 1},
6109 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6110 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6111 {TTI::SK_Broadcast, MVT::v2f32, 1},
6112 {TTI::SK_Broadcast, MVT::v4f32, 1},
6113 {TTI::SK_Broadcast, MVT::v2f64, 1},
6114 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6115 // 'zip1/zip2' instructions.
6116 {TTI::SK_Transpose, MVT::v8i8, 1},
6117 {TTI::SK_Transpose, MVT::v16i8, 1},
6118 {TTI::SK_Transpose, MVT::v4i16, 1},
6119 {TTI::SK_Transpose, MVT::v8i16, 1},
6120 {TTI::SK_Transpose, MVT::v2i32, 1},
6121 {TTI::SK_Transpose, MVT::v4i32, 1},
6122 {TTI::SK_Transpose, MVT::v2i64, 1},
6123 {TTI::SK_Transpose, MVT::v4f16, 1},
6124 {TTI::SK_Transpose, MVT::v8f16, 1},
6125 {TTI::SK_Transpose, MVT::v4bf16, 1},
6126 {TTI::SK_Transpose, MVT::v8bf16, 1},
6127 {TTI::SK_Transpose, MVT::v2f32, 1},
6128 {TTI::SK_Transpose, MVT::v4f32, 1},
6129 {TTI::SK_Transpose, MVT::v2f64, 1},
6130 // Select shuffle kinds.
6131 // TODO: handle vXi8/vXi16.
6132 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6133 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6134 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6135 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6136 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6137 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6138 // PermuteSingleSrc shuffle kinds.
6139 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6140 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6141 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6142 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6143 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6144 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6145 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6146 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6147 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6148 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6149 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6150 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6151 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6152 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6153 // Reverse can be lowered with `rev`.
6154 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6155 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6156 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6157 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6158 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6159 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6160 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6161 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6162 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6163 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6164 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6165 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6166 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6167 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6168 // Splice can all be lowered as `ext`.
6169 {TTI::SK_Splice, MVT::v2i32, 1},
6170 {TTI::SK_Splice, MVT::v4i32, 1},
6171 {TTI::SK_Splice, MVT::v2i64, 1},
6172 {TTI::SK_Splice, MVT::v2f32, 1},
6173 {TTI::SK_Splice, MVT::v4f32, 1},
6174 {TTI::SK_Splice, MVT::v2f64, 1},
6175 {TTI::SK_Splice, MVT::v8f16, 1},
6176 {TTI::SK_Splice, MVT::v8bf16, 1},
6177 {TTI::SK_Splice, MVT::v8i16, 1},
6178 {TTI::SK_Splice, MVT::v16i8, 1},
6179 {TTI::SK_Splice, MVT::v4f16, 1},
6180 {TTI::SK_Splice, MVT::v4bf16, 1},
6181 {TTI::SK_Splice, MVT::v4i16, 1},
6182 {TTI::SK_Splice, MVT::v8i8, 1},
6183 // Broadcast shuffle kinds for scalable vectors
6184 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6185 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6186 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6187 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6188 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6189 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6190 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6191 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6192 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6193 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6194 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6195 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6196 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6197 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6198 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6199 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6200 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6201 // Handle the cases for vector.reverse with scalable vectors
6202 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6203 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6204 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6205 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6206 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6207 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6208 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6209 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6210 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6211 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6212 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6213 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6214 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6215 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6216 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6217 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6218 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6219 };
6220 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6221 return LT.first * Entry->Cost;
6222 }
6223
6224 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6225 return getSpliceCost(SrcTy, Index, CostKind);
6226
6227 // Inserting a subvector can often be done with either a D, S or H register
6228 // move, so long as the inserted vector is "aligned".
6229 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6230 LT.second.getSizeInBits() <= 128 && SubTp) {
6231 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6232 if (SubLT.second.isVector()) {
6233 int NumElts = LT.second.getVectorNumElements();
6234 int NumSubElts = SubLT.second.getVectorNumElements();
6235 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6236 return SubLT.first;
6237 }
6238 }
6239
6240 // Restore optimal kind.
6241 if (IsExtractSubvector)
6242 Kind = TTI::SK_ExtractSubvector;
6243 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6244 Args, CxtI);
6245}
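// [Editor's illustration, not part of the original file] A standalone sketch
// of the mask shape the table above prices as TTI::SK_Reverse: element I of
// the result reads lane NumElts-1-I of the source, e.g. {3, 2, 1, 0} for a
// 4-element vector. The NEON entries above price these at one or two
// instructions (REV64 and/or EXT).
static void makeReverseMaskSketch(int NumElts, int *Mask) {
  for (int I = 0; I < NumElts; ++I)
    Mask[I] = NumElts - 1 - I;
}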
6246
6247static bool containsDecreasingPointers(Loop *TheLoop,
6248 PredicatedScalarEvolution *PSE,
6249 const DominatorTree &DT) {
6250 const auto &Strides = DenseMap<Value *, const SCEV *>();
6251 for (BasicBlock *BB : TheLoop->blocks()) {
6252 // Scan the instructions in the block and look for addresses that are
6253 // consecutive and decreasing.
6254 for (Instruction &I : *BB) {
6255 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6256 Value *Ptr = getLoadStorePointerOperand(&I);
6257 Type *AccessTy = getLoadStoreType(&I);
6258 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6259 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6260 .value_or(0) < 0)
6261 return true;
6262 }
6263 }
6264 }
6265 return false;
6266}
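// [Editor's illustration, not part of the original file] An example of the
// access pattern containsDecreasingPointers() is looking for: a loop whose
// load/store addresses are consecutive but decreasing, which would force a
// reversed predicate if the loop were tail-folded.
//
//   void reverse_accumulate(int *Dst, const int *Src, int N) {
//     for (int I = N - 1; I >= 0; --I)
//       Dst[I] += Src[I]; // addresses step downwards each iteration
//   }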
6267
6268bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6269 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6270 return SVEPreferFixedOverScalableIfEqualCost;
6271 // For cases like post-LTO vectorization, where the trip count eventually
6272 // becomes known, a fixed-width vectorized epilogue can be deleted entirely if
6273 // the trip count is smaller than the epilogue's iteration count. That is why
6274 // we prefer fixed-width vectorization for the epilogue when costs are equal.
6275 if (IsEpilogue)
6276 return true;
6277 return ST->useFixedOverScalableIfEqualCost();
6278}
6279
6280unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6281 return ST->getEpilogueVectorizationMinVF();
6282}
6283
6284bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
6285 if (!ST->hasSVE())
6286 return false;
6287
6288 // We don't currently support vectorisation with interleaving for SVE - with
6289 // such loops we're better off not using tail-folding. This gives us a chance
6290 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6291 if (TFI->IAI->hasGroups())
6292 return false;
6293
6294 TailFoldingOpts Required = TailFoldingOpts::Disabled;
6295 if (TFI->LVL->getReductionVars().size())
6296 Required |= TailFoldingOpts::Reductions;
6297 if (TFI->LVL->getFixedOrderRecurrences().size())
6298 Required |= TailFoldingOpts::Recurrences;
6299
6300 // We call this to discover whether any load/store pointers in the loop have
6301 // negative strides. This will require extra work to reverse the loop
6302 // predicate, which may be expensive.
6303 if (containsDecreasingPointers(TFI->LVL->getLoop(),
6304 TFI->LVL->getPredicatedScalarEvolution(),
6305 *TFI->LVL->getDominatorTree()))
6306 Required |= TailFoldingOpts::Reverse;
6307 if (Required == TailFoldingOpts::Disabled)
6308 Required |= TailFoldingOpts::Simple;
6309
6310 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6311 Required))
6312 return false;
6313
6314 // Don't tail-fold for tight loops where we would be better off interleaving
6315 // with an unpredicated loop.
6316 unsigned NumInsns = 0;
6317 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6318 NumInsns += BB->sizeWithoutDebug();
6319 }
6320
6321 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6322 return NumInsns >= SVETailFoldInsnThreshold;
6323}
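// [Editor's illustration, not part of the original file] The final size gate
// above, reduced to its essence: only loops with at least
// sve-tail-folding-insn-threshold instructions (default 15) are considered
// worth tail-folding; smaller loops are left for an unpredicated, possibly
// interleaved, vector body.
static bool worthTailFoldingSketch(unsigned NumInsns, unsigned Threshold) {
  return NumInsns >= Threshold;
}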
6324
6325InstructionCost
6326AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
6327 StackOffset BaseOffset, bool HasBaseReg,
6328 int64_t Scale, unsigned AddrSpace) const {
6329 // Scaling factors are not free at all.
6330 // Operands | Rt Latency
6331 // -------------------------------------------
6332 // Rt, [Xn, Xm] | 4
6333 // -------------------------------------------
6334 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6335 // Rt, [Xn, Wm, <extend> #imm] |
6336 TargetLoweringBase::AddrMode AM;
6337 AM.BaseGV = BaseGV;
6338 AM.BaseOffs = BaseOffset.getFixed();
6339 AM.HasBaseReg = HasBaseReg;
6340 AM.Scale = Scale;
6341 AM.ScalableOffset = BaseOffset.getScalable();
6342 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6343 // Scale represents reg2 * scale, thus account for 1 if
6344 // it is not equal to 0 or 1.
6345 return AM.Scale != 0 && AM.Scale != 1;
6346 return InstructionCost::getInvalid();
6347}
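// [Editor's illustration, not part of the original file] The decision above in
// reduced form: a legal addressing mode with a non-trivial scale (base reg
// plus a shifted index reg) is charged one unit, reflecting the extra cycle on
// the index-register path in the latency table above; plain reg + reg, or no
// index register at all, is treated as free.
static int scalingFactorCostSketch(bool IsLegalMode, long Scale) {
  if (!IsLegalMode)
    return -1; // stands in for an invalid/unsupported cost
  return (Scale != 0 && Scale != 1) ? 1 : 0;
}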
6348
6349bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6350 const Instruction *I) const {
6351 if (EnableOrLikeSelectOpt) {
6352 // For the binary operators (e.g. or) we need to be more careful than
6353 // selects, here we only transform them if they are already at a natural
6354 // break point in the code - the end of a block with an unconditional
6355 // terminator.
6356 if (I->getOpcode() == Instruction::Or &&
6357 isa<BranchInst>(I->getNextNode()) &&
6358 cast<BranchInst>(I->getNextNode())->isUnconditional())
6359 return true;
6360
6361 if (I->getOpcode() == Instruction::Add ||
6362 I->getOpcode() == Instruction::Sub)
6363 return true;
6364 }
6365 return BaseT::shouldTreatInstructionLikeSelect(I);
6366}
6367
6368bool AArch64TTIImpl::isLSRCostLess(
6369 const TargetTransformInfo::LSRCost &C1,
6370 const TargetTransformInfo::LSRCost &C2) const {
6371 // The AArch64-specific part here is adding the instruction count to the
6372 // comparison (though not as the first consideration, as some targets do),
6373 // along with changing the priority of the base additions.
6374 // TODO: Maybe a more nuanced tradeoff between instruction count
6375 // and number of registers? To be investigated at a later date.
6376 if (EnableLSRCostOpt)
6377 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6378 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6379 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6380 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6381
6382 return BaseT::isLSRCostLess(C1, C2);
6383}
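// [Editor's illustration, not part of the original file] How the std::tie
// comparison above behaves: the tuples are compared lexicographically, so the
// register count dominates, the instruction count only breaks ties, and so on
// down the list. A reduced two-field version:
static bool lsrCostLessSketch(unsigned NumRegs1, unsigned Insns1,
                              unsigned NumRegs2, unsigned Insns2) {
  if (NumRegs1 != NumRegs2)
    return NumRegs1 < NumRegs2; // fewer registers wins outright
  return Insns1 < Insns2;       // otherwise fewer instructions wins
}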
6384
6385static bool isSplatShuffle(Value *V) {
6386 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6387 return all_equal(Shuf->getShuffleMask());
6388 return false;
6389}
6390
6391/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6392/// or upper half of the vector elements.
6393static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6394 bool AllowSplat = false) {
6395 // Scalable types can't be extract shuffle vectors.
6396 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6397 return false;
6398
6399 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6400 auto *FullTy = FullV->getType();
6401 auto *HalfTy = HalfV->getType();
6402 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6403 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6404 };
6405
6406 auto extractHalf = [](Value *FullV, Value *HalfV) {
6407 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6408 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6409 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6410 };
6411
6412 ArrayRef<int> M1, M2;
6413 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6414 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6415 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6416 return false;
6417
6418 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6419 // it is not checked as an extract below.
6420 if (AllowSplat && isSplatShuffle(Op1))
6421 S1Op1 = nullptr;
6422 if (AllowSplat && isSplatShuffle(Op2))
6423 S2Op1 = nullptr;
6424
6425 // Check that the operands are half as wide as the result and we extract
6426 // half of the elements of the input vectors.
6427 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6428 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6429 return false;
6430
6431 // Check the mask extracts either the lower or upper half of vector
6432 // elements.
6433 int M1Start = 0;
6434 int M2Start = 0;
6435 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6436 if ((S1Op1 &&
6437 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6438 (S2Op1 &&
6439 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6440 return false;
6441
6442 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6443 (M2Start != 0 && M2Start != (NumElements / 2)))
6444 return false;
6445 if (S1Op1 && S2Op1 && M1Start != M2Start)
6446 return false;
6447
6448 return true;
6449}
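// [Editor's illustration, not part of the original file] A simplified version
// of the mask check above (it ignores the undef/-1 elements that the real
// ShuffleVectorInst::isExtractSubvectorMask tolerates): the mask must be a run
// of consecutive indices starting at 0 (low half) or at NumElements / 2 (high
// half) of the double-width input.
static bool isHalfExtractMaskSketch(const int *Mask, int HalfElts,
                                    int FullElts) {
  int Start = Mask[0];
  if (Start != 0 && Start != FullElts / 2)
    return false;
  for (int I = 0; I < HalfElts; ++I)
    if (Mask[I] != Start + I)
      return false;
  return true;
}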
6450
6451/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6452/// of the vector elements.
6453static bool areExtractExts(Value *Ext1, Value *Ext2) {
6454 auto areExtDoubled = [](Instruction *Ext) {
6455 return Ext->getType()->getScalarSizeInBits() ==
6456 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6457 };
6458
6459 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6460 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6461 !areExtDoubled(cast<Instruction>(Ext1)) ||
6462 !areExtDoubled(cast<Instruction>(Ext2)))
6463 return false;
6464
6465 return true;
6466}
6467
6468/// Check if Op could be used with vmull_high_p64 intrinsic.
6469static bool isOperandOfVmullHighP64(Value *Op) {
6470 Value *VectorOperand = nullptr;
6471 ConstantInt *ElementIndex = nullptr;
6472 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6473 m_ConstantInt(ElementIndex))) &&
6474 ElementIndex->getValue() == 1 &&
6475 isa<FixedVectorType>(VectorOperand->getType()) &&
6476 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6477}
6478
6479/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6480static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6481 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
6482}
6483
6484static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6485 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6486 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6487 if (!GEP || GEP->getNumOperands() != 2)
6488 return false;
6489
6490 Value *Base = GEP->getOperand(0);
6491 Value *Offsets = GEP->getOperand(1);
6492
6493 // We only care about scalar_base+vector_offsets.
6494 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6495 return false;
6496
6497 // Sink extends that would allow us to use 32-bit offset vectors.
6498 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6499 auto *OffsetsInst = cast<Instruction>(Offsets);
6500 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6501 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6502 Ops.push_back(&GEP->getOperandUse(1));
6503 }
6504
6505 // Sink the GEP.
6506 return true;
6507}
6508
6509/// We want to sink the following cases:
6510/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6511/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
6512static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6513 if (match(Op, m_VScale()))
6514 return true;
6515 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
6516 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
6517 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6518 return true;
6519 }
6520 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
6521 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
6522 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
6523 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
6524 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
6525 return true;
6526 }
6527 return false;
6528}
6529
6530/// Check if sinking \p I's operands to I's basic block is profitable, because
6531/// the operands can be folded into a target instruction, e.g.
6532/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6533bool AArch64TTIImpl::isProfitableToSinkOperands(
6534 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6535 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
6536 switch (II->getIntrinsicID()) {
6537 case Intrinsic::aarch64_neon_smull:
6538 case Intrinsic::aarch64_neon_umull:
6539 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
6540 /*AllowSplat=*/true)) {
6541 Ops.push_back(&II->getOperandUse(0));
6542 Ops.push_back(&II->getOperandUse(1));
6543 return true;
6544 }
6545 [[fallthrough]];
6546
6547 case Intrinsic::fma:
6548 case Intrinsic::fmuladd:
6549 if (isa<VectorType>(I->getType()) &&
6550 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6551 !ST->hasFullFP16())
6552 return false;
6553 [[fallthrough]];
6554 case Intrinsic::aarch64_neon_sqdmull:
6555 case Intrinsic::aarch64_neon_sqdmulh:
6556 case Intrinsic::aarch64_neon_sqrdmulh:
6557 // Sink splats for index lane variants
6558 if (isSplatShuffle(II->getOperand(0)))
6559 Ops.push_back(&II->getOperandUse(0));
6560 if (isSplatShuffle(II->getOperand(1)))
6561 Ops.push_back(&II->getOperandUse(1));
6562 return !Ops.empty();
6563 case Intrinsic::aarch64_neon_fmlal:
6564 case Intrinsic::aarch64_neon_fmlal2:
6565 case Intrinsic::aarch64_neon_fmlsl:
6566 case Intrinsic::aarch64_neon_fmlsl2:
6567 // Sink splats for index lane variants
6568 if (isSplatShuffle(II->getOperand(1)))
6569 Ops.push_back(&II->getOperandUse(1));
6570 if (isSplatShuffle(II->getOperand(2)))
6571 Ops.push_back(&II->getOperandUse(2));
6572 return !Ops.empty();
6573 case Intrinsic::aarch64_sve_ptest_first:
6574 case Intrinsic::aarch64_sve_ptest_last:
6575 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
6576 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6577 Ops.push_back(&II->getOperandUse(0));
6578 return !Ops.empty();
6579 case Intrinsic::aarch64_sme_write_horiz:
6580 case Intrinsic::aarch64_sme_write_vert:
6581 case Intrinsic::aarch64_sme_writeq_horiz:
6582 case Intrinsic::aarch64_sme_writeq_vert: {
6583 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
6584 if (!Idx || Idx->getOpcode() != Instruction::Add)
6585 return false;
6586 Ops.push_back(&II->getOperandUse(1));
6587 return true;
6588 }
6589 case Intrinsic::aarch64_sme_read_horiz:
6590 case Intrinsic::aarch64_sme_read_vert:
6591 case Intrinsic::aarch64_sme_readq_horiz:
6592 case Intrinsic::aarch64_sme_readq_vert:
6593 case Intrinsic::aarch64_sme_ld1b_vert:
6594 case Intrinsic::aarch64_sme_ld1h_vert:
6595 case Intrinsic::aarch64_sme_ld1w_vert:
6596 case Intrinsic::aarch64_sme_ld1d_vert:
6597 case Intrinsic::aarch64_sme_ld1q_vert:
6598 case Intrinsic::aarch64_sme_st1b_vert:
6599 case Intrinsic::aarch64_sme_st1h_vert:
6600 case Intrinsic::aarch64_sme_st1w_vert:
6601 case Intrinsic::aarch64_sme_st1d_vert:
6602 case Intrinsic::aarch64_sme_st1q_vert:
6603 case Intrinsic::aarch64_sme_ld1b_horiz:
6604 case Intrinsic::aarch64_sme_ld1h_horiz:
6605 case Intrinsic::aarch64_sme_ld1w_horiz:
6606 case Intrinsic::aarch64_sme_ld1d_horiz:
6607 case Intrinsic::aarch64_sme_ld1q_horiz:
6608 case Intrinsic::aarch64_sme_st1b_horiz:
6609 case Intrinsic::aarch64_sme_st1h_horiz:
6610 case Intrinsic::aarch64_sme_st1w_horiz:
6611 case Intrinsic::aarch64_sme_st1d_horiz:
6612 case Intrinsic::aarch64_sme_st1q_horiz: {
6613 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
6614 if (!Idx || Idx->getOpcode() != Instruction::Add)
6615 return false;
6616 Ops.push_back(&II->getOperandUse(3));
6617 return true;
6618 }
6619 case Intrinsic::aarch64_neon_pmull:
6620 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
6621 return false;
6622 Ops.push_back(&II->getOperandUse(0));
6623 Ops.push_back(&II->getOperandUse(1));
6624 return true;
6625 case Intrinsic::aarch64_neon_pmull64:
6626 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
6627 II->getArgOperand(1)))
6628 return false;
6629 Ops.push_back(&II->getArgOperandUse(0));
6630 Ops.push_back(&II->getArgOperandUse(1));
6631 return true;
6632 case Intrinsic::masked_gather:
6633 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
6634 return false;
6635 Ops.push_back(&II->getArgOperandUse(0));
6636 return true;
6637 case Intrinsic::masked_scatter:
6638 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
6639 return false;
6640 Ops.push_back(&II->getArgOperandUse(1));
6641 return true;
6642 default:
6643 return false;
6644 }
6645 }
6646
6647 auto ShouldSinkCondition = [](Value *Cond,
6648 SmallVectorImpl<Use *> &Ops) -> bool {
6649 if (!isa<IntrinsicInst>(Cond))
6650 return false;
6651 auto *II = cast<IntrinsicInst>(Cond);
6652 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
6653 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
6654 return false;
6655 if (isa<CmpInst>(II->getOperand(0)))
6656 Ops.push_back(&II->getOperandUse(0));
6657 return true;
6658 };
6659
6660 switch (I->getOpcode()) {
6661 case Instruction::GetElementPtr:
6662 case Instruction::Add:
6663 case Instruction::Sub:
6664 // Sink vscales closer to uses for better isel
6665 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6666 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
6667 Ops.push_back(&I->getOperandUse(Op));
6668 return true;
6669 }
6670 }
6671 break;
6672 case Instruction::Select: {
6673 if (!ShouldSinkCondition(I->getOperand(0), Ops))
6674 return false;
6675
6676 Ops.push_back(&I->getOperandUse(0));
6677 return true;
6678 }
6679 case Instruction::Br: {
6680 if (cast<BranchInst>(I)->isUnconditional())
6681 return false;
6682
6683 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
6684 return false;
6685
6686 Ops.push_back(&I->getOperandUse(0));
6687 return true;
6688 }
6689 default:
6690 break;
6691 }
6692
6693 if (!I->getType()->isVectorTy())
6694 return false;
6695
6696 switch (I->getOpcode()) {
6697 case Instruction::Sub:
6698 case Instruction::Add: {
6699 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
6700 return false;
6701
6702 // If the exts' operands extract either the lower or upper elements, we
6703 // can sink them too.
6704 auto Ext1 = cast<Instruction>(I->getOperand(0));
6705 auto Ext2 = cast<Instruction>(I->getOperand(1));
6706 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
6707 Ops.push_back(&Ext1->getOperandUse(0));
6708 Ops.push_back(&Ext2->getOperandUse(0));
6709 }
6710
6711 Ops.push_back(&I->getOperandUse(0));
6712 Ops.push_back(&I->getOperandUse(1));
6713
6714 return true;
6715 }
6716 case Instruction::Or: {
6717 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6718 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6719 if (ST->hasNEON()) {
6720 Instruction *OtherAnd, *IA, *IB;
6721 Value *MaskValue;
6722 // MainAnd refers to the And instruction that has 'Not' as one of its operands
6723 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
6724 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
6725 m_Instruction(IA)))))) {
6726 if (match(OtherAnd,
6727 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
6728 Instruction *MainAnd = I->getOperand(0) == OtherAnd
6729 ? cast<Instruction>(I->getOperand(1))
6730 : cast<Instruction>(I->getOperand(0));
6731
6732 // Both Ands should be in same basic block as Or
6733 if (I->getParent() != MainAnd->getParent() ||
6734 I->getParent() != OtherAnd->getParent())
6735 return false;
6736
6737 // Non-mask operands of both Ands should also be in same basic block
6738 if (I->getParent() != IA->getParent() ||
6739 I->getParent() != IB->getParent())
6740 return false;
6741
6742 Ops.push_back(
6743 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
6744 Ops.push_back(&I->getOperandUse(0));
6745 Ops.push_back(&I->getOperandUse(1));
6746
6747 return true;
6748 }
6749 }
6750 }
6751
6752 return false;
6753 }
6754 case Instruction::Mul: {
6755 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6756 auto *Ty = cast<VectorType>(V->getType());
6757 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6758 if (Ty->isScalableTy())
6759 return false;
6760
6761 // Indexed variants of Mul exist for i16 and i32 element types only.
6762 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6763 };
6764
6765 int NumZExts = 0, NumSExts = 0;
6766 for (auto &Op : I->operands()) {
6767 // Make sure we are not already sinking this operand
6768 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
6769 continue;
6770
6771 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
6772 auto *Ext = cast<Instruction>(Op);
6773 auto *ExtOp = Ext->getOperand(0);
6774 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6775 Ops.push_back(&Ext->getOperandUse(0));
6776 Ops.push_back(&Op);
6777
6778 if (isa<SExtInst>(Ext)) {
6779 NumSExts++;
6780 } else {
6781 NumZExts++;
6782 // A zext(a) that widens by more than one step also counts as a sext(zext(a)).
6783 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
6784 I->getType()->getScalarSizeInBits())
6785 NumSExts++;
6786 }
6787
6788 continue;
6789 }
6790
6791 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
6792 if (!Shuffle)
6793 continue;
6794
6795 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6796 // operand and the s/zext can help create indexed s/umull. This is
6797 // especially useful to prevent i64 mul being scalarized.
6798 if (isSplatShuffle(Shuffle) &&
6799 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
6800 Ops.push_back(&Shuffle->getOperandUse(0));
6801 Ops.push_back(&Op);
6802 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
6803 NumSExts++;
6804 else
6805 NumZExts++;
6806 continue;
6807 }
6808
6809 Value *ShuffleOperand = Shuffle->getOperand(0);
6810 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
6811 if (!Insert)
6812 continue;
6813
6814 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
6815 if (!OperandInstr)
6816 continue;
6817
6818 ConstantInt *ElementConstant =
6819 dyn_cast<ConstantInt>(Insert->getOperand(2));
6820 // Check that the insertelement is inserting into element 0
6821 if (!ElementConstant || !ElementConstant->isZero())
6822 continue;
6823
6824 unsigned Opcode = OperandInstr->getOpcode();
6825 if (Opcode == Instruction::SExt)
6826 NumSExts++;
6827 else if (Opcode == Instruction::ZExt)
6828 NumZExts++;
6829 else {
6830 // If we find that the top bits are known 0, then we can sink and allow
6831 // the backend to generate a umull.
6832 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6833 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
6834 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
6835 continue;
6836 NumZExts++;
6837 }
6838
6839 // And(Load) is excluded to prevent CGP from getting stuck in a loop of sinking
6840 // the And, just to hoist it again back to the load.
6841 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
6842 Ops.push_back(&Insert->getOperandUse(1));
6843 Ops.push_back(&Shuffle->getOperandUse(0));
6844 Ops.push_back(&Op);
6845 }
6846
6847 // It is profitable to sink if we found two extends of the same type.
6848 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6849 return true;
6850
6851 // Otherwise, see if we should sink splats for indexed variants.
6852 if (!ShouldSinkSplatForIndexedVariant(I))
6853 return false;
6854
6855 Ops.clear();
6856 if (isSplatShuffle(I->getOperand(0)))
6857 Ops.push_back(&I->getOperandUse(0));
6858 if (isSplatShuffle(I->getOperand(1)))
6859 Ops.push_back(&I->getOperandUse(1));
6860
6861 return !Ops.empty();
6862 }
6863 case Instruction::FMul: {
6864 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6865 if (I->getType()->isScalableTy())
6866 return false;
6867
6868 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
6869 !ST->hasFullFP16())
6870 return false;
6871
6872 // Sink splats for index lane variants
6873 if (isSplatShuffle(I->getOperand(0)))
6874 Ops.push_back(&I->getOperandUse(0));
6875 if (isSplatShuffle(I->getOperand(1)))
6876 Ops.push_back(&I->getOperandUse(1));
6877 return !Ops.empty();
6878 }
6879 default:
6880 return false;
6881 }
6882 return false;
6883}
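// [Editor's illustration, not part of the original file] The final
// profitability rule in the Mul case above, reduced to its essence: sinking
// pays off once two extends of the same kind feed the multiply, because the
// backend can then form a single smull/umull (or an indexed variant).
static bool profitableToSinkMulSketch(bool FoundCandidates, int NumSExts,
                                      int NumZExts) {
  return FoundCandidates && (NumSExts == 2 || NumZExts == 2);
}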
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
unsigned countLeadingOnes() const
Definition APInt.h:1625
void negate()
Negate this APInt in place.
Definition APInt.h:1469
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1041
unsigned logBase2() const
Definition APInt.h:1762
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:828
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
bool isUnsigned() const
Definition InstrTypes.h:936
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
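A hedged sketch of how these constant factories compose, assuming the getSplat listed above is ConstantVector::getSplat and Ctx is a pre-existing LLVMContext:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;
  // Build a <4 x i32> splat of 7; getNullValue/getAllOnesValue give the
  // all-zero and all-ones vectors for the same type.
  static Constant *splatOfSeven(LLVMContext &Ctx) {
    Type *I32 = Type::getInt32Ty(Ctx);
    return ConstantVector::getSplat(ElementCount::getFixed(4),
                                    ConstantInt::get(I32, 7)); // <4 x i32> <7, 7, 7, 7>
  }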
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:760
bool empty() const
Definition DenseMap.h:109
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
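A minimal sketch of the fixed vs. scalable element counts these factories produce, assuming Ctx is a pre-existing LLVMContext:

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;
  static void elementCountExample(LLVMContext &Ctx) {
    // <4 x i32>: exactly four lanes.
    auto *FixedTy = VectorType::get(Type::getInt32Ty(Ctx), ElementCount::getFixed(4));
    // <vscale x 4 x i32>: a multiple of four lanes, scaled by the runtime vscale.
    auto *ScalableTy = VectorType::get(Type::getInt32Ty(Ctx), ElementCount::getScalable(4));
    (void)FixedTy;
    (void)ScalableTy;
  }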
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool allowContract() const
Definition FMF.h:69
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1107
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:575
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:595
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:580
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1926
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2497
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1714
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2207
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1850
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2601
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1863
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
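A minimal sketch of the masked load/store builders, assuming B is an IRBuilder already positioned at the insertion point and SrcPtr, DstPtr, Mask and Passthru are hypothetical pre-existing values of matching types:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;
  static void emitMaskedCopy(IRBuilderBase &B, Value *SrcPtr, Value *DstPtr,
                             Value *Mask, Value *Passthru) {
    auto *VecTy = FixedVectorType::get(B.getInt32Ty(), 4);
    // Load <4 x i32> from SrcPtr; inactive lanes take their value from Passthru.
    Value *V = B.CreateMaskedLoad(VecTy, SrcPtr, Align(16), Mask, Passthru, "mload");
    // Store only the active lanes back to DstPtr.
    B.CreateMaskedStore(V, DstPtr, Align(16), Mask);
  }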
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:590
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2280
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
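A small sketch of how InstructionCost's invalid state propagates through accumulation (the numeric values are illustrative only):

  #include "llvm/Support/InstructionCost.h"
  #include <cassert>
  using namespace llvm;
  static InstructionCost accumulateCosts() {
    InstructionCost Cost = 2;               // a valid, known cost
    Cost += InstructionCost::getInvalid();  // arithmetic with an invalid cost stays invalid
    assert(!Cost.isValid());
    return Cost;
  }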
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition MapVector.h:56
Information for memory intrinsic cost model.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
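A hedged sketch of a typical trip-count query built on these entry points; SE and L are assumed to come from the caller's analyses, and the bound of 16 is arbitrary:

  #include "llvm/Analysis/ScalarEvolution.h"
  using namespace llvm;
  static bool hasSmallKnownTripCount(ScalarEvolution &SE, const Loop *L) {
    const SCEV *BTC = SE.getBackedgeTakenCount(L);
    if (isa<SCEVCouldNotCompute>(BTC))
      return false;                                  // backedge count is not predictable
    unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
    return MaxTC != 0 && MaxTC <= 16;                // 0 means unknown or too large
  }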
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
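A small sketch of these mask classifiers on a concrete mask (the values are chosen purely for illustration):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;
  static bool isHighHalfExtract() {
    int Mask[] = {4, 5, 6, 7};    // lanes 4..7 of an 8-element source
    int Index = 0;
    // Recognised as an extract-subvector mask starting at element 4.
    return ShuffleVectorInst::isExtractSubvectorMask(Mask, /*NumSrcElts=*/8, Index) &&
           Index == 4;
  }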
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
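A hedged sketch of combining fixed and scalable components with StackOffset (the byte counts are illustrative):

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;
  static StackOffset exampleFrameOffset() {
    // 16 fixed bytes plus 32 bytes scaled by the runtime vscale.
    return StackOffset::getFixed(16) + StackOffset::getScalable(32);
  }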
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:702
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
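A hedged sketch using these Type queries to widen a vector's lanes, assuming Ctx is a pre-existing LLVMContext:

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;
  static Type *widenLanesToI32(LLVMContext &Ctx) {
    Type *VecI8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);  // <16 x i8>
    Type *VecI32 = VecI8->getWithNewBitWidth(32);                  // same lane count: <16 x i32>
    return VecI32->getScalarSizeInBits() == 32 ? VecI32 : nullptr;
  }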
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
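In the spirit of how this file costs integer immediates, a hedged sketch that counts the move-immediate instructions an expansion would need (assuming the backend-local AArch64ExpandImm.h is on the include path):

  #include "AArch64ExpandImm.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;
  static unsigned movImmInsnCount(uint64_t Imm) {
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(Imm, /*BitSize=*/64, Insn);
    return Insn.size();   // e.g. one instruction for 0xffff, more for scattered bits
  }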
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
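A small sketch of how these matchers compose; V is an assumed Value* under inspection and the widening-add pattern is purely illustrative:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;
  static bool isWideningAdd(Value *V) {
    Value *A = nullptr, *B = nullptr;
    // Matches "(zext A) + (zext B)" with the operands in either order.
    return match(V, m_c_Add(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
  }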
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
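A hedged sketch of the cost-table pattern this helper supports; the entries and costs below are made up for illustration:

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  using namespace llvm;
  static unsigned lookupExampleCost(MVT Ty) {
    static const CostTblEntry Tbl[] = {
        {ISD::ADD, MVT::v4i32, 1},   // hypothetical costs
        {ISD::MUL, MVT::v4i32, 4},
    };
    if (const auto *Entry = CostTableLookup(Tbl, ISD::MUL, Ty))
      return Entry->Cost;
    return 1;   // fall back to a default cost
  }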
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
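A hedged sketch of classifying a concrete mask with these helpers; the declaring backend header is assumed here to be AArch64PerfectShuffle.h, and the mask values are chosen to match uzp1:

  #include "AArch64PerfectShuffle.h"   // assumed location of isUZPMask/isREVMask
  using namespace llvm;
  static bool classifyAsUZP1() {
    int Mask[] = {0, 2, 4, 6, 8, 10, 12, 14};   // even lanes of the concatenated inputs
    unsigned WhichResult = 0;
    // WhichResult == 0 corresponds to uzp1, 1 to uzp2.
    return isUZPMask(Mask, /*NumElts=*/8, WhichResult) && WhichResult == 0;
  }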
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2108
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
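A hedged sketch of a consecutive-access check built on getPtrStride; PSE, DT, Lp and the access are assumed to come from the caller's analyses:

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include <optional>
  using namespace llvm;
  static bool isConsecutiveForwardAccess(PredicatedScalarEvolution &PSE,
                                         const DominatorTree &DT, const Loop *Lp,
                                         Type *AccessTy, Value *Ptr) {
    // The stride is reported in units of AccessTy; +1 means consecutive forward.
    std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp, DT);
    return Stride && *Stride == 1;
  }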
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool isFixedLengthVector() const
Definition ValueTypes.h:181
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
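A hedged sketch of building fixed and scalable EVTs with these helpers, assuming Ctx is a pre-existing LLVMContext:

  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;
  static bool evtExample(LLVMContext &Ctx) {
    EVT V4I32   = EVT::getVectorVT(Ctx, MVT::i32, 4);                       // v4i32
    EVT NxV4I32 = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);  // nxv4i32
    return V4I32.isFixedLengthVector() && NxV4I32.isScalableVector() &&
           V4I32.getScalarSizeInBits() == 32;
  }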
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...