1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
29#include <algorithm>
30#include <optional>
31using namespace llvm;
32using namespace llvm::PatternMatch;
33
34#define DEBUG_TYPE "aarch64tti"
35
36static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37 cl::init(true), cl::Hidden);
38
40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41
42static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
43 cl::Hidden);
44
45static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46 cl::init(10), cl::Hidden);
47
48static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49 cl::init(15), cl::Hidden);
50
51static cl::opt<unsigned>
52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
53 cl::Hidden);
54
56 "call-penalty-sm-change", cl::init(5), cl::Hidden,
58 "Penalty of calling a function that requires a change to PSTATE.SM"));
59
61 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63
64static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65 cl::init(true), cl::Hidden);
66
67static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68 cl::init(true), cl::Hidden);
69
70// A complete guess as to a reasonable cost.
71static cl::opt<unsigned>
72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
73 cl::desc("The cost of a histcnt instruction"));
74
76 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
77 cl::desc("The number of instructions to search for a redundant dmb"));
78
79namespace {
80class TailFoldingOption {
81 // These bitfields will only ever be set to something non-zero in operator=,
82 // when setting the -sve-tail-folding option. This option should always be of
83 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
84 // InitialBits is one of (disabled|all|simple). EnableBits represents
85 // additional flags we're enabling, and DisableBits for those flags we're
86 // disabling. The default flag is tracked in the variable NeedsDefault, since
87 // at the time of setting the option we may not know what the default value
88 // for the CPU is.
89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92
93 // This value needs to be initialised to true in case the user does not
94 // explicitly set the -sve-tail-folding option.
95 bool NeedsDefault = true;
96
97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98
99 void setNeedsDefault(bool V) { NeedsDefault = V; }
100
101 void setEnableBit(TailFoldingOpts Bit) {
102 EnableBits |= Bit;
103 DisableBits &= ~Bit;
104 }
105
106 void setDisableBit(TailFoldingOpts Bit) {
107 EnableBits &= ~Bit;
108 DisableBits |= Bit;
109 }
110
111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113
114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115 "Initial bits should only include one of "
116 "(disabled|all|simple|default)");
117 Bits = NeedsDefault ? DefaultBits : InitialBits;
118 Bits |= EnableBits;
119 Bits &= ~DisableBits;
120
121 return Bits;
122 }
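  // For example (illustrative values): if DefaultBits for the CPU is
  // (Reductions|Recurrences) and the user passed
  // "default+reverse+noreductions", then NeedsDefault is true, EnableBits is
  // Reverse and DisableBits is Reductions, so getBits() yields
  // (Recurrences|Reverse).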
123
124 void reportError(std::string Opt) {
125 errs() << "invalid argument '" << Opt
126 << "' to -sve-tail-folding=; the option should be of the form\n"
127 " (disabled|all|default|simple)[+(reductions|recurrences"
128 "|reverse|noreductions|norecurrences|noreverse)]\n";
129 report_fatal_error("Unrecognised tail-folding option");
130 }
131
132public:
133
134 void operator=(const std::string &Val) {
135 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 if (Val.empty()) {
137 reportError("");
138 return;
139 }
140
141 // Since the user is explicitly setting the option we don't automatically
142 // need the default unless they require it.
143 setNeedsDefault(false);
144
145 SmallVector<StringRef, 4> TailFoldTypes;
146 StringRef(Val).split(TailFoldTypes, '+', -1, false);
147
148 unsigned StartIdx = 1;
149 if (TailFoldTypes[0] == "disabled")
150 setInitialBits(TailFoldingOpts::Disabled);
151 else if (TailFoldTypes[0] == "all")
152 setInitialBits(TailFoldingOpts::All);
153 else if (TailFoldTypes[0] == "default")
154 setNeedsDefault(true);
155 else if (TailFoldTypes[0] == "simple")
156 setInitialBits(TailFoldingOpts::Simple);
157 else {
158 StartIdx = 0;
159 setInitialBits(TailFoldingOpts::Disabled);
160 }
161
162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163 if (TailFoldTypes[I] == "reductions")
164 setEnableBit(TailFoldingOpts::Reductions);
165 else if (TailFoldTypes[I] == "recurrences")
166 setEnableBit(TailFoldingOpts::Recurrences);
167 else if (TailFoldTypes[I] == "reverse")
168 setEnableBit(TailFoldingOpts::Reverse);
169 else if (TailFoldTypes[I] == "noreductions")
170 setDisableBit(TailFoldingOpts::Reductions);
171 else if (TailFoldTypes[I] == "norecurrences")
172 setDisableBit(TailFoldingOpts::Recurrences);
173 else if (TailFoldTypes[I] == "noreverse")
174 setDisableBit(TailFoldingOpts::Reverse);
175 else
176 reportError(Val);
177 }
178 }
179
180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181 return (getBits(DefaultBits) & Required) == Required;
182 }
183};
184} // namespace
185
186TailFoldingOption TailFoldingOptionLoc;
187
189 "sve-tail-folding",
190 cl::desc(
191 "Control the use of vectorisation using tail-folding for SVE where the"
192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193 "\ndisabled (Initial) No loop types will vectorize using "
194 "tail-folding"
195 "\ndefault (Initial) Uses the default tail-folding settings for "
196 "the target CPU"
197 "\nall (Initial) All legal loop types will vectorize using "
198 "tail-folding"
199 "\nsimple (Initial) Use tail-folding for simple loops (not "
200 "reductions or recurrences)"
201 "\nreductions Use tail-folding for loops containing reductions"
202 "\nnoreductions Inverse of above"
203 "\nrecurrences Use tail-folding for loops containing fixed order "
204 "recurrences"
205 "\nnorecurrences Inverse of above"
206 "\nreverse Use tail-folding for loops requiring reversed "
207 "predicates"
208 "\nnoreverse Inverse of above"),
209 cl::location(TailFoldingOptionLoc));
210
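// For example, "-sve-tail-folding=default+reductions+noreverse" keeps the
// target CPU's default tail-folding behaviour, additionally enables
// tail-folding for loops containing reductions, and disables it for loops
// that require reversed predicates.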
211// Experimental option that will only be fully functional when the
212// code-generator is changed to use SVE instead of NEON for all fixed-width
213// operations.
215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
216
217// Experimental option that will only be fully functional when the cost-model
218// and code-generator have been changed to avoid using scalable vector
219// instructions that are not legal in streaming SVE mode.
221 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
222
223static bool isSMEABIRoutineCall(const CallInst &CI) {
224 const auto *F = CI.getCalledFunction();
225 return F && StringSwitch<bool>(F->getName())
226 .Case("__arm_sme_state", true)
227 .Case("__arm_tpidr2_save", true)
228 .Case("__arm_tpidr2_restore", true)
229 .Case("__arm_za_disable", true)
230 .Default(false);
231}
232
233/// Returns true if the function has explicit operations that can only be
234/// lowered using incompatible instructions for the selected mode. This also
235/// returns true if the function F may use or modify ZA state.
237 for (const BasicBlock &BB : *F) {
238 for (const Instruction &I : BB) {
239 // Be conservative for now and assume that any call to inline asm or to
240 // intrinsics could result in non-streaming ops (e.g. calls to
241 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
242 // all native LLVM instructions can be lowered to compatible instructions.
243 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
244 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
245 isSMEABIRoutineCall(cast<CallInst>(I))))
246 return true;
247 }
248 }
249 return false;
250}
251
252uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
253 StringRef AttributeStr =
254 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
255 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
256 SmallVector<StringRef, 8> Features;
257 FeatureStr.split(Features, ",");
258 return AArch64::getFMVPriority(Features);
259}
260
262 return F.hasFnAttribute("fmv-features");
263}
264
265bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
266 const Function *Callee) const {
267 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
268
269 // When inlining, we should consider the body of the function, not the
270 // interface.
271 if (CalleeAttrs.hasStreamingBody()) {
272 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
273 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
274 }
275
276 if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
277 return false;
278
279 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
280 CallerAttrs.requiresSMChange(CalleeAttrs) ||
281 CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
282 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
283 if (hasPossibleIncompatibleOps(Callee))
284 return false;
285 }
286
287 return BaseT::areInlineCompatible(Caller, Callee);
288}
289
290bool AArch64TTIImpl::areTypesABICompatible(
291 const Function *Caller, const Function *Callee,
292 const ArrayRef<Type *> &Types) const {
293 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
294 return false;
295
296 // We need to ensure that argument promotion does not attempt to promote
297 // pointers to fixed-length vector types larger than 128 bits like
298 // <8 x float> (and pointers to aggregate types which have such fixed-length
299 // vector type members) into the values of the pointees. Such vector types
300 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
301 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
302 // types can be safely treated as 128-bit NEON types and they cannot be
303 // distinguished in IR.
304 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
305 auto FVTy = dyn_cast<FixedVectorType>(Ty);
306 return FVTy &&
307 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
308 }))
309 return false;
310
311 return true;
312}
313
314unsigned
315AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
316 unsigned DefaultCallPenalty) const {
317 // This function calculates a penalty for executing Call in F.
318 //
319 // There are two ways this function can be called:
320 // (1) F:
321 // call from F -> G (the call here is Call)
322 //
323 // For (1), Call.getCaller() == F, so it will always return a high cost if
324 // a streaming-mode change is required (thus promoting the need to inline the
325 // function)
326 //
327 // (2) F:
328 // call from F -> G (the call here is not Call)
329 // G:
330 // call from G -> H (the call here is Call)
331 //
332 // For (2), if after inlining the body of G into F the call to H requires a
333 // streaming-mode change, and the call to G from F would also require a
334 // streaming-mode change, then there is benefit to do the streaming-mode
335 // change only once and avoid inlining of G into F.
336 SMEAttrs FAttrs(*F);
337 SMEAttrs CalleeAttrs(Call);
338 if (FAttrs.requiresSMChange(CalleeAttrs)) {
339 if (F == Call.getCaller()) // (1)
340 return CallPenaltyChangeSM * DefaultCallPenalty;
341 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
342 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
343 }
344
345 return DefaultCallPenalty;
346}
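// With the option defaults above this amounts to a 5x penalty for case (1)
// (CallPenaltyChangeSM = 5) and a 10x penalty for case (2)
// (InlineCallPenaltyChangeSM = 10), relative to DefaultCallPenalty.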
347
348bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
349 TargetTransformInfo::RegisterKind K) const {
350 assert(K != TargetTransformInfo::RGK_Scalar);
351 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
352 ST->isNeonAvailable());
353}
354
355/// Calculate the cost of materializing a 64-bit value. This helper
356/// method might only calculate a fraction of a larger immediate. Therefore it
357/// is valid to return a cost of ZERO.
358InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
359 // Check if the immediate can be encoded within an instruction.
360 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
361 return 0;
362
363 if (Val < 0)
364 Val = ~Val;
365
366 // Calculate how many moves we will need to materialize this constant.
367 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
368 AArch64_IMM::expandMOVImm(Val, 64, Insn);
369 return Insn.size();
370}
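// For example, 0 and valid 64-bit logical immediates cost 0 here, whereas an
// arbitrary 64-bit constant such as 0x1234567890ABCDEF typically expands to
// one MOVZ plus three MOVK instructions, giving a cost of 4.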
371
372/// Calculate the cost of materializing the given constant.
373InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
374 TTI::TargetCostKind CostKind) {
375 assert(Ty->isIntegerTy());
376
377 unsigned BitSize = Ty->getPrimitiveSizeInBits();
378 if (BitSize == 0)
379 return ~0U;
380
381 // Sign-extend all constants to a multiple of 64-bit.
382 APInt ImmVal = Imm;
383 if (BitSize & 0x3f)
384 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
385
386 // Split the constant into 64-bit chunks and calculate the cost for each
387 // chunk.
388 InstructionCost Cost = 0;
389 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
390 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
391 int64_t Val = Tmp.getSExtValue();
392 Cost += getIntImmCost(Val);
393 }
394 // We need at least one instruction to materialize the constant.
395 return std::max<InstructionCost>(1, Cost);
396}
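// For example, a 128-bit immediate is costed as two independent 64-bit
// chunks; an all-zero or all-ones chunk contributes 0, and the final result
// is clamped to a minimum of 1.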
397
399 const APInt &Imm, Type *Ty,
401 Instruction *Inst) {
402 assert(Ty->isIntegerTy());
403
404 unsigned BitSize = Ty->getPrimitiveSizeInBits();
405 // There is no cost model for constants with a bit size of 0. Return TCC_Free
406 // here, so that constant hoisting will ignore this constant.
407 if (BitSize == 0)
408 return TTI::TCC_Free;
409
410 unsigned ImmIdx = ~0U;
411 switch (Opcode) {
412 default:
413 return TTI::TCC_Free;
414 case Instruction::GetElementPtr:
415 // Always hoist the base address of a GetElementPtr.
416 if (Idx == 0)
417 return 2 * TTI::TCC_Basic;
418 return TTI::TCC_Free;
419 case Instruction::Store:
420 ImmIdx = 0;
421 break;
422 case Instruction::Add:
423 case Instruction::Sub:
424 case Instruction::Mul:
425 case Instruction::UDiv:
426 case Instruction::SDiv:
427 case Instruction::URem:
428 case Instruction::SRem:
429 case Instruction::And:
430 case Instruction::Or:
431 case Instruction::Xor:
432 case Instruction::ICmp:
433 ImmIdx = 1;
434 break;
435 // Always return TCC_Free for the shift value of a shift instruction.
436 case Instruction::Shl:
437 case Instruction::LShr:
438 case Instruction::AShr:
439 if (Idx == 1)
440 return TTI::TCC_Free;
441 break;
442 case Instruction::Trunc:
443 case Instruction::ZExt:
444 case Instruction::SExt:
445 case Instruction::IntToPtr:
446 case Instruction::PtrToInt:
447 case Instruction::BitCast:
448 case Instruction::PHI:
449 case Instruction::Call:
450 case Instruction::Select:
451 case Instruction::Ret:
452 case Instruction::Load:
453 break;
454 }
455
456 if (Idx == ImmIdx) {
457 int NumConstants = (BitSize + 63) / 64;
458 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
459 return (Cost <= NumConstants * TTI::TCC_Basic)
460 ? static_cast<int>(TTI::TCC_Free)
461 : Cost;
462 }
463 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
464}
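// For example, for "add i64 %x, C" the immediate is operand 1 (ImmIdx): a C
// that materializes in a single instruction is reported as TCC_Free so that
// constant hoisting leaves it alone, whereas a C needing several MOVZ/MOVK
// instructions reports its real cost and becomes a hoisting candidate.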
465
468 const APInt &Imm, Type *Ty,
470 assert(Ty->isIntegerTy());
471
472 unsigned BitSize = Ty->getPrimitiveSizeInBits();
473 // There is no cost model for constants with a bit size of 0. Return TCC_Free
474 // here, so that constant hoisting will ignore this constant.
475 if (BitSize == 0)
476 return TTI::TCC_Free;
477
478 // Most (all?) AArch64 intrinsics do not support folding immediates into the
479 // selected instruction, so we compute the materialization cost for the
480 // immediate directly.
481 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
482 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
483
484 switch (IID) {
485 default:
486 return TTI::TCC_Free;
487 case Intrinsic::sadd_with_overflow:
488 case Intrinsic::uadd_with_overflow:
489 case Intrinsic::ssub_with_overflow:
490 case Intrinsic::usub_with_overflow:
491 case Intrinsic::smul_with_overflow:
492 case Intrinsic::umul_with_overflow:
493 if (Idx == 1) {
494 int NumConstants = (BitSize + 63) / 64;
495 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
496 return (Cost <= NumConstants * TTI::TCC_Basic)
497 ? static_cast<int>(TTI::TCC_Free)
498 : Cost;
499 }
500 break;
501 case Intrinsic::experimental_stackmap:
502 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
503 return TTI::TCC_Free;
504 break;
505 case Intrinsic::experimental_patchpoint_void:
506 case Intrinsic::experimental_patchpoint:
507 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
508 return TTI::TCC_Free;
509 break;
510 case Intrinsic::experimental_gc_statepoint:
511 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
512 return TTI::TCC_Free;
513 break;
514 }
515 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
516}
517
518TargetTransformInfo::PopcntSupportKind
519AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
520 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
521 if (TyWidth == 32 || TyWidth == 64)
522 return TTI::PSK_FastHardware;
523 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
524 return TTI::PSK_Software;
525}
526
527static bool isUnpackedVectorVT(EVT VecVT) {
528 return VecVT.isScalableVector() &&
529 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
530}
531
532static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
533 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
534 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
535 unsigned TotalHistCnts = 1;
536
537 unsigned EltSize = EltTy->getScalarSizeInBits();
538 // Only allow (up to 64b) integers or pointers
539 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
540 return InstructionCost::getInvalid();
541
542 // FIXME: We should be able to generate histcnt for fixed-length vectors
543 // using ptrue with a specific VL.
544 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
545 unsigned EC = VTy->getElementCount().getKnownMinValue();
546 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
547 return InstructionCost::getInvalid();
548
549 // HistCnt only supports 32b and 64b element types
550 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
551
552 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
553 return InstructionCost(BaseHistCntCost);
554
555 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
556 TotalHistCnts = EC / NaturalVectorWidth;
557 }
558
559 return InstructionCost(BaseHistCntCost * TotalHistCnts);
560}
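// For example (assuming the default BaseHistCntCost of 8): a histogram over
// <vscale x 8 x ptr> buckets holding i16 counters legalises to 32-bit
// elements, so NaturalVectorWidth is 128/32 = 4 and the cost becomes
// 2 * BaseHistCntCost = 16.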
561
562InstructionCost
563AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
564 TTI::TargetCostKind CostKind) {
565 // The code-generator is currently not able to handle scalable vectors
566 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
567 // it. This change will be removed when code-generation for these types is
568 // sufficiently reliable.
569 auto *RetTy = ICA.getReturnType();
570 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
571 if (VTy->getElementCount() == ElementCount::getScalable(1))
572 return InstructionCost::getInvalid();
573
574 switch (ICA.getID()) {
575 case Intrinsic::experimental_vector_histogram_add:
576 if (!ST->hasSVE2())
577 return InstructionCost::getInvalid();
578 return getHistogramCost(ICA);
579 case Intrinsic::umin:
580 case Intrinsic::umax:
581 case Intrinsic::smin:
582 case Intrinsic::smax: {
583 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
584 MVT::v8i16, MVT::v2i32, MVT::v4i32,
585 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
586 MVT::nxv2i64};
587 auto LT = getTypeLegalizationCost(RetTy);
588 // v2i64 types get converted to cmp+bif hence the cost of 2
589 if (LT.second == MVT::v2i64)
590 return LT.first * 2;
591 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
592 return LT.first;
593 break;
594 }
595 case Intrinsic::sadd_sat:
596 case Intrinsic::ssub_sat:
597 case Intrinsic::uadd_sat:
598 case Intrinsic::usub_sat: {
599 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
600 MVT::v8i16, MVT::v2i32, MVT::v4i32,
601 MVT::v2i64};
602 auto LT = getTypeLegalizationCost(RetTy);
603 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
604 // need to extend the type, as it uses shr(qadd(shl, shl)).
605 unsigned Instrs =
606 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
607 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
608 return LT.first * Instrs;
609 break;
610 }
611 case Intrinsic::abs: {
612 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
613 MVT::v8i16, MVT::v2i32, MVT::v4i32,
614 MVT::v2i64};
615 auto LT = getTypeLegalizationCost(RetTy);
616 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
617 return LT.first;
618 break;
619 }
620 case Intrinsic::bswap: {
621 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
622 MVT::v4i32, MVT::v2i64};
623 auto LT = getTypeLegalizationCost(RetTy);
624 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
625 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
626 return LT.first;
627 break;
628 }
629 case Intrinsic::stepvector: {
630 InstructionCost Cost = 1; // Cost of the `index' instruction
631 auto LT = getTypeLegalizationCost(RetTy);
632 // Legalisation of illegal vectors involves an `index' instruction plus
633 // (LT.first - 1) vector adds.
634 if (LT.first > 1) {
635 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
636 InstructionCost AddCost =
637 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
638 Cost += AddCost * (LT.first - 1);
639 }
640 return Cost;
641 }
642 case Intrinsic::vector_extract:
643 case Intrinsic::vector_insert: {
644 // If both the vector and subvector types are legal types and the index
645 // is 0, then this should be a no-op or simple operation; return a
646 // relatively low cost.
647
648 // If arguments aren't actually supplied, then we cannot determine the
649 // value of the index. We also want to skip predicate types.
650 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
651 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
652 break;
653
654 LLVMContext &C = RetTy->getContext();
655 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
656 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
657 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
658 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
659 // Skip this if either the vector or subvector types are unpacked
660 // SVE types; they may get lowered to stack stores and loads.
661 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
662 break;
663
664 TargetLoweringBase::LegalizeKind SubVecLK =
665 getTLI()->getTypeConversion(C, SubVecVT);
666 TargetLoweringBase::LegalizeKind VecLK =
667 getTLI()->getTypeConversion(C, VecVT);
668 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
669 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
670 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
671 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
672 return TTI::TCC_Free;
673 break;
674 }
675 case Intrinsic::bitreverse: {
676 static const CostTblEntry BitreverseTbl[] = {
677 {Intrinsic::bitreverse, MVT::i32, 1},
678 {Intrinsic::bitreverse, MVT::i64, 1},
679 {Intrinsic::bitreverse, MVT::v8i8, 1},
680 {Intrinsic::bitreverse, MVT::v16i8, 1},
681 {Intrinsic::bitreverse, MVT::v4i16, 2},
682 {Intrinsic::bitreverse, MVT::v8i16, 2},
683 {Intrinsic::bitreverse, MVT::v2i32, 2},
684 {Intrinsic::bitreverse, MVT::v4i32, 2},
685 {Intrinsic::bitreverse, MVT::v1i64, 2},
686 {Intrinsic::bitreverse, MVT::v2i64, 2},
687 };
688 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
689 const auto *Entry =
690 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
691 if (Entry) {
692 // The cost model uses the legal type (i32) that i8 and i16 are promoted
693 // to, plus 1 so that we match the actual lowering cost.
694 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
695 TLI->getValueType(DL, RetTy, true) == MVT::i16)
696 return LegalisationCost.first * Entry->Cost + 1;
697
698 return LegalisationCost.first * Entry->Cost;
699 }
700 break;
701 }
702 case Intrinsic::ctpop: {
703 if (!ST->hasNEON()) {
704 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
705 return getTypeLegalizationCost(RetTy).first * 12;
706 }
707 static const CostTblEntry CtpopCostTbl[] = {
708 {ISD::CTPOP, MVT::v2i64, 4},
709 {ISD::CTPOP, MVT::v4i32, 3},
710 {ISD::CTPOP, MVT::v8i16, 2},
711 {ISD::CTPOP, MVT::v16i8, 1},
712 {ISD::CTPOP, MVT::i64, 4},
713 {ISD::CTPOP, MVT::v2i32, 3},
714 {ISD::CTPOP, MVT::v4i16, 2},
715 {ISD::CTPOP, MVT::v8i8, 1},
716 {ISD::CTPOP, MVT::i32, 5},
717 };
718 auto LT = getTypeLegalizationCost(RetTy);
719 MVT MTy = LT.second;
720 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
721 // Extra cost of +1 when illegal vector types are legalized by promoting
722 // the integer type.
723 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
724 RetTy->getScalarSizeInBits()
725 ? 1
726 : 0;
727 return LT.first * Entry->Cost + ExtraCost;
728 }
729 break;
730 }
731 case Intrinsic::sadd_with_overflow:
732 case Intrinsic::uadd_with_overflow:
733 case Intrinsic::ssub_with_overflow:
734 case Intrinsic::usub_with_overflow:
735 case Intrinsic::smul_with_overflow:
736 case Intrinsic::umul_with_overflow: {
737 static const CostTblEntry WithOverflowCostTbl[] = {
738 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
739 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
740 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
741 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
742 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
743 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
744 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
745 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
746 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
747 {Intrinsic::usub_with_overflow, MVT::i8, 3},
748 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
749 {Intrinsic::usub_with_overflow, MVT::i16, 3},
750 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
751 {Intrinsic::usub_with_overflow, MVT::i32, 1},
752 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
753 {Intrinsic::usub_with_overflow, MVT::i64, 1},
754 {Intrinsic::smul_with_overflow, MVT::i8, 5},
755 {Intrinsic::umul_with_overflow, MVT::i8, 4},
756 {Intrinsic::smul_with_overflow, MVT::i16, 5},
757 {Intrinsic::umul_with_overflow, MVT::i16, 4},
758 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
759 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
760 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
761 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
762 };
763 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
764 if (MTy.isSimple())
765 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
766 MTy.getSimpleVT()))
767 return Entry->Cost;
768 break;
769 }
770 case Intrinsic::fptosi_sat:
771 case Intrinsic::fptoui_sat: {
772 if (ICA.getArgTypes().empty())
773 break;
774 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
775 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
776 EVT MTy = TLI->getValueType(DL, RetTy);
777 // Check for the legal types, which are where the size of the input and the
778 // output are the same, or we are using cvt f64->i32 or f32->i64.
779 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
780 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
781 LT.second == MVT::v2f64)) {
782 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
783 (LT.second == MVT::f64 && MTy == MVT::i32) ||
784 (LT.second == MVT::f32 && MTy == MVT::i64)))
785 return LT.first;
786 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
787 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
788 MTy.getScalarSizeInBits() == 64)
789 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
790 }
791 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
792 // f32.
793 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
794 return LT.first + getIntrinsicInstrCost(
795 {ICA.getID(),
796 RetTy,
797 {ICA.getArgTypes()[0]->getWithNewType(
798 Type::getFloatTy(RetTy->getContext()))}},
799 CostKind);
800 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
801 (LT.second == MVT::f16 && MTy == MVT::i64) ||
802 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
803 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
804 return LT.first;
805 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
806 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
807 MTy.getScalarSizeInBits() == 32)
808 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
809 // Extending vector types v8f16->v8i64. These currently scalarize but the
810 // codegen could be better.
811 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
812 MTy.getScalarSizeInBits() == 64)
813 return MTy.getVectorNumElements() * 3;
814
815 // If we can we use a legal convert followed by a min+max
816 if ((LT.second.getScalarType() == MVT::f32 ||
817 LT.second.getScalarType() == MVT::f64 ||
818 LT.second.getScalarType() == MVT::f16) &&
819 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
820 Type *LegalTy =
821 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
822 if (LT.second.isVector())
823 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
824 InstructionCost Cost = 1;
825 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
826 LegalTy, {LegalTy, LegalTy});
827 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
828 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
829 LegalTy, {LegalTy, LegalTy});
830 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
831 return LT.first * Cost +
832 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
833 : 1);
834 }
835 // Otherwise we need to follow the default expansion that clamps the value
836 // using a float min/max with a fcmp+sel for nan handling when signed.
837 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
838 RetTy = RetTy->getScalarType();
839 if (LT.second.isVector()) {
840 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
841 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
842 }
843 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
844 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
845 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
846 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
847 Cost +=
848 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
849 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
850 if (IsSigned) {
851 Type *CondTy = RetTy->getWithNewBitWidth(1);
852 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
853 CmpInst::FCMP_UNO, CostKind);
854 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
855 CmpInst::FCMP_UNO, CostKind);
856 }
857 return LT.first * Cost;
858 }
859 case Intrinsic::fshl:
860 case Intrinsic::fshr: {
861 if (ICA.getArgs().empty())
862 break;
863
864 // TODO: Add handling for fshl where third argument is not a constant.
865 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
866 if (!OpInfoZ.isConstant())
867 break;
868
869 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
870 if (OpInfoZ.isUniform()) {
871 // FIXME: The costs could be lower if the codegen is better.
872 static const CostTblEntry FshlTbl[] = {
873 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
874 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
875 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
876 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
877 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
878 // to avoid having to duplicate the costs.
879 const auto *Entry =
880 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
881 if (Entry)
882 return LegalisationCost.first * Entry->Cost;
883 }
884
885 auto TyL = getTypeLegalizationCost(RetTy);
886 if (!RetTy->isIntegerTy())
887 break;
888
889 // Estimate cost manually, as types like i8 and i16 will get promoted to
890 // i32 and CostTableLookup will ignore the extra conversion cost.
891 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
892 RetTy->getScalarSizeInBits() < 64) ||
893 (RetTy->getScalarSizeInBits() % 64 != 0);
894 unsigned ExtraCost = HigherCost ? 1 : 0;
895 if (RetTy->getScalarSizeInBits() == 32 ||
896 RetTy->getScalarSizeInBits() == 64)
897 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
898 // extr instruction.
899 else if (HigherCost)
900 ExtraCost = 1;
901 else
902 break;
903 return TyL.first + ExtraCost;
904 }
905 case Intrinsic::get_active_lane_mask: {
906 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
907 if (RetTy) {
908 EVT RetVT = getTLI()->getValueType(DL, RetTy);
909 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
910 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
911 !getTLI()->isTypeLegal(RetVT)) {
912 // We don't have enough context at this point to determine if the mask
913 // is going to be kept live after the block, which will force the vXi1
914 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
915 // For now, we just assume the vectorizer created this intrinsic and
916 // the result will be the input for a PHI. In this case the cost will
917 // be extremely high for fixed-width vectors.
918 // NOTE: getScalarizationOverhead returns a cost that's far too
919 // pessimistic for the actual generated codegen. In reality there are
920 // two instructions generated per lane.
921 return RetTy->getNumElements() * 2;
922 }
923 }
924 break;
925 }
926 case Intrinsic::experimental_vector_match: {
927 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
928 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
929 unsigned SearchSize = NeedleTy->getNumElements();
930 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
931 // Base cost for MATCH instructions. At least on the Neoverse V2 and
932 // Neoverse V3, these are cheap operations with the same latency as a
933 // vector ADD. In most cases, however, we also need to do an extra DUP.
934 // For fixed-length vectors we currently need an extra five to six
935 // instructions besides the MATCH.
936 InstructionCost Cost = 4;
937 if (isa<FixedVectorType>(RetTy))
938 Cost += 10;
939 return Cost;
940 }
941 break;
942 }
943 default:
944 break;
945 }
946 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
947}
948
949/// The function will remove redundant reinterpret casts in the presence
950/// of control flow.
951static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
952 IntrinsicInst &II) {
953 SmallVector<Instruction *, 32> Worklist;
954 auto RequiredType = II.getType();
955
956 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
957 assert(PN && "Expected Phi Node!");
958
959 // Don't create a new Phi unless we can remove the old one.
960 if (!PN->hasOneUse())
961 return std::nullopt;
962
963 for (Value *IncValPhi : PN->incoming_values()) {
964 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
965 if (!Reinterpret ||
966 Reinterpret->getIntrinsicID() !=
967 Intrinsic::aarch64_sve_convert_to_svbool ||
968 RequiredType != Reinterpret->getArgOperand(0)->getType())
969 return std::nullopt;
970 }
971
972 // Create the new Phi
973 IC.Builder.SetInsertPoint(PN);
974 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
975 Worklist.push_back(PN);
976
977 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
978 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
979 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
980 Worklist.push_back(Reinterpret);
981 }
982
983 // Cleanup Phi Node and reinterprets
984 return IC.replaceInstUsesWith(II, NPN);
985}
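// A rough sketch of the transformation above:
//   %phi = phi <vscale x 16 x i1> [ %a.sv, %bb1 ], [ %b.sv, %bb2 ]
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
// where %a.sv and %b.sv are convert.to.svbool of <vscale x 4 x i1> values
// becomes a single <vscale x 4 x i1> phi of the original values.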
986
987// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
988// => (binop (pred) (from_svbool _) (from_svbool _))
989//
990// The above transformation eliminates a `to_svbool` in the predicate
991// operand of bitwise operation `binop` by narrowing the vector width of
992// the operation. For example, it would convert a `<vscale x 16 x i1>
993// and` into a `<vscale x 4 x i1> and`. This is profitable because
994// to_svbool must zero the new lanes during widening, whereas
995// from_svbool is free.
996static std::optional<Instruction *>
997tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
998 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
999 if (!BinOp)
1000 return std::nullopt;
1001
1002 auto IntrinsicID = BinOp->getIntrinsicID();
1003 switch (IntrinsicID) {
1004 case Intrinsic::aarch64_sve_and_z:
1005 case Intrinsic::aarch64_sve_bic_z:
1006 case Intrinsic::aarch64_sve_eor_z:
1007 case Intrinsic::aarch64_sve_nand_z:
1008 case Intrinsic::aarch64_sve_nor_z:
1009 case Intrinsic::aarch64_sve_orn_z:
1010 case Intrinsic::aarch64_sve_orr_z:
1011 break;
1012 default:
1013 return std::nullopt;
1014 }
1015
1016 auto BinOpPred = BinOp->getOperand(0);
1017 auto BinOpOp1 = BinOp->getOperand(1);
1018 auto BinOpOp2 = BinOp->getOperand(2);
1019
1020 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1021 if (!PredIntr ||
1022 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1023 return std::nullopt;
1024
1025 auto PredOp = PredIntr->getOperand(0);
1026 auto PredOpTy = cast<VectorType>(PredOp->getType());
1027 if (PredOpTy != II.getType())
1028 return std::nullopt;
1029
1030 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1031 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1032 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1033 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1034 if (BinOpOp1 == BinOpOp2)
1035 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1036 else
1037 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1038 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1039
1040 auto NarrowedBinOp =
1041 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1042 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1043}
1044
1045static std::optional<Instruction *>
1046instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1047 // If the reinterpret instruction operand is a PHI Node
1048 if (isa<PHINode>(II.getArgOperand(0)))
1049 return processPhiNode(IC, II);
1050
1051 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1052 return BinOpCombine;
1053
1054 // Ignore converts to/from svcount_t.
1055 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1056 isa<TargetExtType>(II.getType()))
1057 return std::nullopt;
1058
1059 SmallVector<Instruction *, 32> CandidatesForRemoval;
1060 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1061
1062 const auto *IVTy = cast<VectorType>(II.getType());
1063
1064 // Walk the chain of conversions.
1065 while (Cursor) {
1066 // If the type of the cursor has fewer lanes than the final result, zeroing
1067 // must take place, which breaks the equivalence chain.
1068 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1069 if (CursorVTy->getElementCount().getKnownMinValue() <
1070 IVTy->getElementCount().getKnownMinValue())
1071 break;
1072
1073 // If the cursor has the same type as I, it is a viable replacement.
1074 if (Cursor->getType() == IVTy)
1075 EarliestReplacement = Cursor;
1076
1077 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1078
1079 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1080 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1081 Intrinsic::aarch64_sve_convert_to_svbool ||
1082 IntrinsicCursor->getIntrinsicID() ==
1083 Intrinsic::aarch64_sve_convert_from_svbool))
1084 break;
1085
1086 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1087 Cursor = IntrinsicCursor->getOperand(0);
1088 }
1089
1090 // If no viable replacement in the conversion chain was found, there is
1091 // nothing to do.
1092 if (!EarliestReplacement)
1093 return std::nullopt;
1094
1095 return IC.replaceInstUsesWith(II, EarliestReplacement);
1096}
1097
1098static bool isAllActivePredicate(Value *Pred) {
1099 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1100 Value *UncastedPred;
1101 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1102 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1103 m_Value(UncastedPred)))))
1104 // If the predicate has the same or fewer lanes than the uncasted
1105 // predicate then we know the casting has no effect.
1106 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1107 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1108 Pred = UncastedPred;
1109
1110 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1111 m_ConstantInt<AArch64SVEPredPattern::all>()));
1112}
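// For example, both "ptrue(all)" itself and
// "convert.from.svbool(convert.to.svbool(ptrue(all)))" are treated as
// all-active here, since the svbool round trip cannot drop active lanes when
// the outer predicate type is no wider than the uncasted one.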
1113
1114// Simplify unary operation where predicate has all inactive lanes by replacing
1115// instruction with its operand
1116static std::optional<Instruction *>
1117instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
1118 bool hasInactiveVector) {
1119 int PredOperand = hasInactiveVector ? 1 : 0;
1120 int ReplaceOperand = hasInactiveVector ? 0 : 1;
1121 if (match(II.getOperand(PredOperand), m_ZeroInt())) {
1122 IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
1123 return IC.eraseInstFromFunction(II);
1124 }
1125 return std::nullopt;
1126}
1127
1128// Simplify unary operation where predicate has all inactive lanes or
1129// replace unused first operand with undef when all lanes are active
1130static std::optional<Instruction *>
1131instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
1132 if (isAllActivePredicate(II.getOperand(1)) &&
1133 !isa<llvm::UndefValue>(II.getOperand(0)) &&
1134 !isa<llvm::PoisonValue>(II.getOperand(0))) {
1135 Value *Undef = llvm::UndefValue::get(II.getType());
1136 return IC.replaceOperand(II, 0, Undef);
1137 }
1138 return instCombineSVENoActiveReplace(IC, II, true);
1139}
1140
1141// Erase unary operation where predicate has all inactive lanes
1142static std::optional<Instruction *>
1143instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
1144 int PredPos) {
1145 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1146 return IC.eraseInstFromFunction(II);
1147 }
1148 return std::nullopt;
1149}
1150
1151// Simplify operation where predicate has all inactive lanes by replacing
1152// instruction with zeroed object
1153static std::optional<Instruction *>
1154instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) {
1155 if (match(II.getOperand(0), m_ZeroInt())) {
1156 Constant *Node;
1157 Type *RetTy = II.getType();
1158 if (RetTy->isStructTy()) {
1159 auto StructT = cast<StructType>(RetTy);
1160 auto VecT = StructT->getElementType(0);
1161 SmallVector<Constant *> ZerVec;
1162 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1163 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1164 : ConstantInt::get(VecT, 0));
1165 }
1166 Node = ConstantStruct::get(StructT, ZerVec);
1167 } else
1168 Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1169 : ConstantInt::get(II.getType(), 0);
1170
1171 IC.replaceInstUsesWith(II, Node);
1172 return IC.eraseInstFromFunction(II);
1173 }
1174 return std::nullopt;
1175}
1176
1177static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1178 IntrinsicInst &II) {
1179 // svsel(ptrue, x, y) => x
1180 auto *OpPredicate = II.getOperand(0);
1181 if (isAllActivePredicate(OpPredicate))
1182 return IC.replaceInstUsesWith(II, II.getOperand(1));
1183
1184 auto Select =
1185 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1186 return IC.replaceInstUsesWith(II, Select);
1187}
1188
1189static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1190 IntrinsicInst &II) {
1191 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1192 if (!Pg)
1193 return std::nullopt;
1194
1195 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1196 return std::nullopt;
1197
1198 const auto PTruePattern =
1199 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1200 if (PTruePattern != AArch64SVEPredPattern::vl1)
1201 return std::nullopt;
1202
1203 // The intrinsic is inserting into lane zero so use an insert instead.
1204 auto *IdxTy = Type::getInt64Ty(II.getContext());
1205 auto *Insert = InsertElementInst::Create(
1206 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1207 Insert->insertBefore(II.getIterator());
1208 Insert->takeName(&II);
1209
1210 return IC.replaceInstUsesWith(II, Insert);
1211}
1212
1213static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1214 IntrinsicInst &II) {
1215 // Replace DupX with a regular IR splat.
1216 auto *RetTy = cast<ScalableVectorType>(II.getType());
1217 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1218 II.getArgOperand(0));
1219 Splat->takeName(&II);
1220 return IC.replaceInstUsesWith(II, Splat);
1221}
1222
1223static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1224 IntrinsicInst &II) {
1225 LLVMContext &Ctx = II.getContext();
1226
1227 // Replace by zero constant when all lanes are inactive
1228 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1229 return II_NA;
1230
1231 // Check that the predicate is all active
1232 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1233 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1234 return std::nullopt;
1235
1236 const auto PTruePattern =
1237 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1238 if (PTruePattern != AArch64SVEPredPattern::all)
1239 return std::nullopt;
1240
1241 // Check that we have a compare of zero..
1242 auto *SplatValue =
1243 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1244 if (!SplatValue || !SplatValue->isZero())
1245 return std::nullopt;
1246
1247 // ..against a dupq
1248 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1249 if (!DupQLane ||
1250 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1251 return std::nullopt;
1252
1253 // Where the dupq is a lane 0 replicate of a vector insert
1254 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1255 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1256 return std::nullopt;
1257
1258 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1259 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1260 return std::nullopt;
1261
1262 // Where the vector insert is a fixed constant vector insert into undef at
1263 // index zero
1264 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1265 return std::nullopt;
1266
1267 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1268 return std::nullopt;
1269
1270 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1271 if (!ConstVec)
1272 return std::nullopt;
1273
1274 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1275 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1276 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1277 return std::nullopt;
1278
1279 unsigned NumElts = VecTy->getNumElements();
1280 unsigned PredicateBits = 0;
1281
1282 // Expand intrinsic operands to a 16-bit byte level predicate
1283 for (unsigned I = 0; I < NumElts; ++I) {
1284 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1285 if (!Arg)
1286 return std::nullopt;
1287 if (!Arg->isZero())
1288 PredicateBits |= 1 << (I * (16 / NumElts));
1289 }
1290
1291 // If all bits are zero bail early with an empty predicate
1292 if (PredicateBits == 0) {
1293 auto *PFalse = Constant::getNullValue(II.getType());
1294 PFalse->takeName(&II);
1295 return IC.replaceInstUsesWith(II, PFalse);
1296 }
1297
1298 // Calculate largest predicate type used (where byte predicate is largest)
1299 unsigned Mask = 8;
1300 for (unsigned I = 0; I < 16; ++I)
1301 if ((PredicateBits & (1 << I)) != 0)
1302 Mask |= (I % 8);
1303
1304 unsigned PredSize = Mask & -Mask;
1305 auto *PredType = ScalableVectorType::get(
1306 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1307
1308 // Ensure all relevant bits are set
1309 for (unsigned I = 0; I < 16; I += PredSize)
1310 if ((PredicateBits & (1 << I)) == 0)
1311 return std::nullopt;
1312
1313 auto *PTruePat =
1314 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1315 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1316 {PredType}, {PTruePat});
1317 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1318 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1319 auto *ConvertFromSVBool =
1320 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1321 {II.getType()}, {ConvertToSVBool});
1322
1323 ConvertFromSVBool->takeName(&II);
1324 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1325}
1326
1327static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1328 IntrinsicInst &II) {
1329 Value *Pg = II.getArgOperand(0);
1330 Value *Vec = II.getArgOperand(1);
1331 auto IntrinsicID = II.getIntrinsicID();
1332 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1333
1334 // lastX(splat(X)) --> X
1335 if (auto *SplatVal = getSplatValue(Vec))
1336 return IC.replaceInstUsesWith(II, SplatVal);
1337
1338 // If x and/or y is a splat value then:
1339 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1340 Value *LHS, *RHS;
1341 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1342 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1343 auto *OldBinOp = cast<BinaryOperator>(Vec);
1344 auto OpC = OldBinOp->getOpcode();
1345 auto *NewLHS =
1346 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1347 auto *NewRHS =
1348 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1349 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1350 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1351 return IC.replaceInstUsesWith(II, NewBinOp);
1352 }
1353 }
1354
1355 auto *C = dyn_cast<Constant>(Pg);
1356 if (IsAfter && C && C->isNullValue()) {
1357 // The intrinsic is extracting lane 0 so use an extract instead.
1358 auto *IdxTy = Type::getInt64Ty(II.getContext());
1359 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1360 Extract->insertBefore(II.getIterator());
1361 Extract->takeName(&II);
1362 return IC.replaceInstUsesWith(II, Extract);
1363 }
1364
1365 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1366 if (!IntrPG)
1367 return std::nullopt;
1368
1369 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1370 return std::nullopt;
1371
1372 const auto PTruePattern =
1373 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1374
1375 // Can the intrinsic's predicate be converted to a known constant index?
1376 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1377 if (!MinNumElts)
1378 return std::nullopt;
1379
1380 unsigned Idx = MinNumElts - 1;
1381 // Increment the index if extracting the element after the last active
1382 // predicate element.
1383 if (IsAfter)
1384 ++Idx;
1385
1386 // Ignore extracts whose index is larger than the known minimum vector
1387 // length. NOTE: This is an artificial constraint where we prefer to
1388 // maintain what the user asked for until an alternative is proven faster.
1389 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1390 if (Idx >= PgVTy->getMinNumElements())
1391 return std::nullopt;
1392
1393 // The intrinsic is extracting a fixed lane so use an extract instead.
1394 auto *IdxTy = Type::getInt64Ty(II.getContext());
1395 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1396 Extract->insertBefore(II.getIterator());
1397 Extract->takeName(&II);
1398 return IC.replaceInstUsesWith(II, Extract);
1399}
1400
1401static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1402 IntrinsicInst &II) {
1403 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1404 // integer variant across a variety of micro-architectures. Replace scalar
1405 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1406 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1407 // depending on the micro-architecture, but has been observed as generally
1408 // being faster, particularly when the CLAST[AB] op is a loop-carried
1409 // dependency.
1410 Value *Pg = II.getArgOperand(0);
1411 Value *Fallback = II.getArgOperand(1);
1412 Value *Vec = II.getArgOperand(2);
1413 Type *Ty = II.getType();
1414
1415 if (!Ty->isIntegerTy())
1416 return std::nullopt;
1417
1418 Type *FPTy;
1419 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1420 default:
1421 return std::nullopt;
1422 case 16:
1423 FPTy = IC.Builder.getHalfTy();
1424 break;
1425 case 32:
1426 FPTy = IC.Builder.getFloatTy();
1427 break;
1428 case 64:
1429 FPTy = IC.Builder.getDoubleTy();
1430 break;
1431 }
1432
1433 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1434 auto *FPVTy = VectorType::get(
1435 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1436 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1437 auto *FPII = IC.Builder.CreateIntrinsic(
1438 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1439 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1440 return IC.replaceInstUsesWith(II, FPIItoInt);
1441}
1442
1443static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1444 IntrinsicInst &II) {
1445 LLVMContext &Ctx = II.getContext();
1446 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1447 // can work with RDFFR_PP for ptest elimination.
1448 auto *AllPat =
1449 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1450 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1451 {II.getType()}, {AllPat});
1452 auto *RDFFR =
1453 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1454 RDFFR->takeName(&II);
1455 return IC.replaceInstUsesWith(II, RDFFR);
1456}
1457
1458static std::optional<Instruction *>
1459instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1460 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1461
1462 if (Pattern == AArch64SVEPredPattern::all) {
1463 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1464 auto *VScale = IC.Builder.CreateVScale(StepVal);
1465 VScale->takeName(&II);
1466 return IC.replaceInstUsesWith(II, VScale);
1467 }
1468
1469 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1470
1471 return MinNumElts && NumElts >= MinNumElts
1472 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1473 II, ConstantInt::get(II.getType(), MinNumElts)))
1474 : std::nullopt;
1475}
1476
1477static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1478 IntrinsicInst &II) {
1479 Value *PgVal = II.getArgOperand(0);
1480 Value *OpVal = II.getArgOperand(1);
1481
1482 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1483 // Later optimizations prefer this form.
1484 if (PgVal == OpVal &&
1485 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1486 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1487 Value *Ops[] = {PgVal, OpVal};
1488 Type *Tys[] = {PgVal->getType()};
1489
1490 auto *PTest =
1491 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1492 PTest->takeName(&II);
1493
1494 return IC.replaceInstUsesWith(II, PTest);
1495 }
1496
1497 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1498 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1499
1500 if (!Pg || !Op)
1501 return std::nullopt;
1502
1503 Intrinsic::ID OpIID = Op->getIntrinsicID();
1504
1505 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1506 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1507 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1508 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1509 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1510
1511 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1512
1513 PTest->takeName(&II);
1514 return IC.replaceInstUsesWith(II, PTest);
1515 }
1516
1517 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1518 // Later optimizations may rewrite sequence to use the flag-setting variant
1519 // of instruction X to remove PTEST.
1520 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1521 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1522 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1523 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1524 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1525 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1526 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1527 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1528 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1529 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1530 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1531 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1532 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1533 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1534 Type *Tys[] = {Pg->getType()};
1535
1536 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1537 PTest->takeName(&II);
1538
1539 return IC.replaceInstUsesWith(II, PTest);
1540 }
1541
1542 return std::nullopt;
1543}
1544
1545template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1546static std::optional<Instruction *>
1547instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1548 bool MergeIntoAddendOp) {
1549 Value *P = II.getOperand(0);
1550 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1551 if (MergeIntoAddendOp) {
1552 AddendOp = II.getOperand(1);
1553 Mul = II.getOperand(2);
1554 } else {
1555 AddendOp = II.getOperand(2);
1556 Mul = II.getOperand(1);
1557 }
1558
1559 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1560 m_Value(MulOp1))))
1561 return std::nullopt;
1562
1563 if (!Mul->hasOneUse())
1564 return std::nullopt;
1565
1566 Instruction *FMFSource = nullptr;
1567 if (II.getType()->isFPOrFPVectorTy()) {
1568 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1569 // Stop the combine when the flags on the inputs differ in case dropping
1570 // flags would lead to us missing out on more beneficial optimizations.
1571 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1572 return std::nullopt;
1573 if (!FAddFlags.allowContract())
1574 return std::nullopt;
1575 FMFSource = &II;
1576 }
1577
1578 CallInst *Res;
1579 if (MergeIntoAddendOp)
1580 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1581 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1582 else
1583 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1584 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1585
1586 return IC.replaceInstUsesWith(II, Res);
1587}
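// Illustrative sketch (schematic pseudo-IR, names invented): with
// MulOpc = sve.fmul, FuseOpc = sve.fmla and MergeIntoAddendOp = true, a
// single-use multiply feeding the addend form
//   M = sve.fmul(P, B, C)
//   R = sve.fadd(P, A, M)
// becomes
//   R = sve.fmla(P, A, B, C)
// provided both calls use the same predicate P and, for FP, matching
// fast-math flags that allow contraction.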
1588
1589static std::optional<Instruction *>
1590instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1591  Value *Pred = II.getOperand(0);
1592 Value *PtrOp = II.getOperand(1);
1593 Type *VecTy = II.getType();
1594
1595  // Replace with a zero constant when all lanes are inactive.
1596 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1597 return II_NA;
1598
1599 if (isAllActivePredicate(Pred)) {
1600 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1601 Load->copyMetadata(II);
1602 return IC.replaceInstUsesWith(II, Load);
1603 }
1604
1605 CallInst *MaskedLoad =
1606 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1607 Pred, ConstantAggregateZero::get(VecTy));
1608 MaskedLoad->copyMetadata(II);
1609 return IC.replaceInstUsesWith(II, MaskedLoad);
1610}
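// Illustrative sketch (schematic, names invented): sve.ld1(ptrue(all), %ptr)
// becomes a plain vector load of %ptr, while any other predicate becomes
//   llvm.masked.load(%ptr, align, %pred, zeroinitializer)
// with the intrinsic's metadata copied onto the new load in both cases.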
1611
1612static std::optional<Instruction *>
1613instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1614  Value *VecOp = II.getOperand(0);
1615 Value *Pred = II.getOperand(1);
1616 Value *PtrOp = II.getOperand(2);
1617
1618 if (isAllActivePredicate(Pred)) {
1619 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1620 Store->copyMetadata(II);
1621 return IC.eraseInstFromFunction(II);
1622 }
1623
1624 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1625 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1626 MaskedStore->copyMetadata(II);
1627 return IC.eraseInstFromFunction(II);
1628}
1629
1630static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1631  switch (Intrinsic) {
1632 case Intrinsic::aarch64_sve_fmul_u:
1633 return Instruction::BinaryOps::FMul;
1634 case Intrinsic::aarch64_sve_fadd_u:
1635 return Instruction::BinaryOps::FAdd;
1636 case Intrinsic::aarch64_sve_fsub_u:
1637 return Instruction::BinaryOps::FSub;
1638 default:
1639 return Instruction::BinaryOpsEnd;
1640 }
1641}
1642
1643static std::optional<Instruction *>
1644instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1645  // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1646 if (II.isStrictFP())
1647 return std::nullopt;
1648
1649 auto *OpPredicate = II.getOperand(0);
1650 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1651 if (BinOpCode == Instruction::BinaryOpsEnd ||
1652 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1653 m_ConstantInt<AArch64SVEPredPattern::all>())))
1654 return std::nullopt;
1655 auto BinOp = IC.Builder.CreateBinOpFMF(
1656 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
1657 return IC.replaceInstUsesWith(II, BinOp);
1658}
1659
1660// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1661// sve.add_u).
1662static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1663 Intrinsic::ID IID) {
1664 auto *OpPredicate = II.getOperand(0);
1665 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1666 m_ConstantInt<AArch64SVEPredPattern::all>())))
1667 return std::nullopt;
1668
1669 auto *Mod = II.getModule();
1670 auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()});
1671 II.setCalledFunction(NewDecl);
1672
1673 return &II;
1674}
1675
1676// Simplify operations where the predicate has all inactive lanes, or try to
1677// replace with the _u form when all lanes are active.
1678static std::optional<Instruction *>
1679instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1680                            Intrinsic::ID IID) {
1681 if (match(II.getOperand(0), m_ZeroInt())) {
1682    // llvm_ir, pred(0), op1, op2 - the spec says to return op1 when all lanes
1683    // are inactive for the merging form sv[func]_m.
1684 return IC.replaceInstUsesWith(II, II.getOperand(1));
1685 }
1686 return instCombineSVEAllActive(II, IID);
1687}
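// Illustrative sketch (schematic, names invented): for a merging intrinsic
// such as sve.add(P, A, B),
//   P == all-false  -->  the call folds to A (the value kept by _m merging),
//   P == ptrue(all) -->  the call is retargeted to sve.add_u, whose inactive
//                        lanes are undefined and easier to optimise further.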
1688
1689static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1690 IntrinsicInst &II) {
1691 if (auto II_U =
1692 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1693 return II_U;
1694 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1695 Intrinsic::aarch64_sve_mla>(
1696 IC, II, true))
1697 return MLA;
1698 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1699 Intrinsic::aarch64_sve_mad>(
1700 IC, II, false))
1701 return MAD;
1702 return std::nullopt;
1703}
1704
1705static std::optional<Instruction *>
1706instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1707  if (auto II_U =
1708 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1709 return II_U;
1710 if (auto FMLA =
1711 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1712 Intrinsic::aarch64_sve_fmla>(IC, II,
1713 true))
1714 return FMLA;
1715 if (auto FMAD =
1716 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1717 Intrinsic::aarch64_sve_fmad>(IC, II,
1718 false))
1719 return FMAD;
1720 if (auto FMLA =
1721 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1722 Intrinsic::aarch64_sve_fmla>(IC, II,
1723 true))
1724 return FMLA;
1725 return std::nullopt;
1726}
1727
1728static std::optional<Instruction *>
1729instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1730  if (auto FMLA =
1731 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1732 Intrinsic::aarch64_sve_fmla>(IC, II,
1733 true))
1734 return FMLA;
1735 if (auto FMAD =
1736 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1737 Intrinsic::aarch64_sve_fmad>(IC, II,
1738 false))
1739 return FMAD;
1740 if (auto FMLA_U =
1741 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1742 Intrinsic::aarch64_sve_fmla_u>(
1743 IC, II, true))
1744 return FMLA_U;
1745 return instCombineSVEVectorBinOp(IC, II);
1746}
1747
1748static std::optional<Instruction *>
1749instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1750  if (auto II_U =
1751 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1752 return II_U;
1753 if (auto FMLS =
1754 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1755 Intrinsic::aarch64_sve_fmls>(IC, II,
1756 true))
1757 return FMLS;
1758 if (auto FMSB =
1759 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1760 Intrinsic::aarch64_sve_fnmsb>(
1761 IC, II, false))
1762 return FMSB;
1763 if (auto FMLS =
1764 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1765 Intrinsic::aarch64_sve_fmls>(IC, II,
1766 true))
1767 return FMLS;
1768 return std::nullopt;
1769}
1770
1771static std::optional<Instruction *>
1772instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1773  if (auto FMLS =
1774 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1775 Intrinsic::aarch64_sve_fmls>(IC, II,
1776 true))
1777 return FMLS;
1778 if (auto FMSB =
1779 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1780 Intrinsic::aarch64_sve_fnmsb>(
1781 IC, II, false))
1782 return FMSB;
1783 if (auto FMLS_U =
1784 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1785 Intrinsic::aarch64_sve_fmls_u>(
1786 IC, II, true))
1787 return FMLS_U;
1788 return instCombineSVEVectorBinOp(IC, II);
1789}
1790
1791static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1792 IntrinsicInst &II) {
1793 if (auto II_U =
1794 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1795 return II_U;
1796 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1797 Intrinsic::aarch64_sve_mls>(
1798 IC, II, true))
1799 return MLS;
1800 return std::nullopt;
1801}
1802
1803static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1804                                                             IntrinsicInst &II,
1805                                                             Intrinsic::ID IID) {
1806 auto *OpPredicate = II.getOperand(0);
1807 auto *OpMultiplicand = II.getOperand(1);
1808 auto *OpMultiplier = II.getOperand(2);
1809
1810 // Return true if a given instruction is a unit splat value, false otherwise.
1811 auto IsUnitSplat = [](auto *I) {
1812 auto *SplatValue = getSplatValue(I);
1813 if (!SplatValue)
1814 return false;
1815 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1816 };
1817
1818 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1819 // with a unit splat value, false otherwise.
1820 auto IsUnitDup = [](auto *I) {
1821 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1822 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1823 return false;
1824
1825 auto *SplatValue = IntrI->getOperand(2);
1826 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1827 };
1828
1829 if (IsUnitSplat(OpMultiplier)) {
1830 // [f]mul pg %n, (dupx 1) => %n
1831 OpMultiplicand->takeName(&II);
1832 return IC.replaceInstUsesWith(II, OpMultiplicand);
1833 } else if (IsUnitDup(OpMultiplier)) {
1834 // [f]mul pg %n, (dup pg 1) => %n
1835 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1836 auto *DupPg = DupInst->getOperand(1);
1837 // TODO: this is naive. The optimization is still valid if DupPg
1838 // 'encompasses' OpPredicate, not only if they're the same predicate.
1839 if (OpPredicate == DupPg) {
1840 OpMultiplicand->takeName(&II);
1841 return IC.replaceInstUsesWith(II, OpMultiplicand);
1842 }
1843 }
1844
1845 return instCombineSVEVectorBinOp(IC, II);
1846}
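// Illustrative sketch (schematic, names invented):
//   sve.fmul(PG, %n, (dupx 1.0))        -->  %n
//   sve.fmul(PG, %n, sve.dup(_, PG, 1.0)) --> %n   (same predicate only)
// Multiplications that do not involve a unit splat fall through to the
// generic binop combine above.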
1847
1848static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1849 IntrinsicInst &II) {
1850 Value *UnpackArg = II.getArgOperand(0);
1851 auto *RetTy = cast<ScalableVectorType>(II.getType());
1852 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1853 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1854
1855 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1856 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1857 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1858 ScalarArg =
1859 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1860 Value *NewVal =
1861 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1862 NewVal->takeName(&II);
1863 return IC.replaceInstUsesWith(II, NewVal);
1864 }
1865
1866 return std::nullopt;
1867}
1868static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1869 IntrinsicInst &II) {
1870 auto *OpVal = II.getOperand(0);
1871 auto *OpIndices = II.getOperand(1);
1872 VectorType *VTy = cast<VectorType>(II.getType());
1873
1874  // Check whether OpIndices is a constant splat value smaller than the
1875  // minimum element count of the result.
1876 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1877 if (!SplatValue ||
1878 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1879 return std::nullopt;
1880
1881  // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
1882  // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1883 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1884 auto *VectorSplat =
1885 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1886
1887 VectorSplat->takeName(&II);
1888 return IC.replaceInstUsesWith(II, VectorSplat);
1889}
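// Illustrative sketch (schematic, names invented): with a constant index that
// is known to be in range,
//   sve_tbl(%v, sve_dup_x(i32 2))  -->  splat(extractelement(%v, 2))
// which exposes the splat to the generic shuffle/splat folds.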
1890
1891static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1892 IntrinsicInst &II) {
1893 Value *A, *B;
1894 Type *RetTy = II.getType();
1895 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1896 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1897
1898 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1899 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1900 if ((match(II.getArgOperand(0),
1901 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1902 match(II.getArgOperand(1),
1903 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1904 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1905 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1906 auto *TyA = cast<ScalableVectorType>(A->getType());
1907    if (TyA == B->getType() &&
1908        RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1909      auto *SubVec = IC.Builder.CreateInsertVector(
1910          RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1911      auto *ConcatVec = IC.Builder.CreateInsertVector(
1912          RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1913 ConcatVec->takeName(&II);
1914 return IC.replaceInstUsesWith(II, ConcatVec);
1915 }
1916 }
1917
1918 return std::nullopt;
1919}
1920
1921static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1922 IntrinsicInst &II) {
1923 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1924 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1925 Value *A, *B;
1926 if (match(II.getArgOperand(0),
1927 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1928 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1929 m_Specific(A), m_Specific(B))))
1930 return IC.replaceInstUsesWith(
1931 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1932
1933 return std::nullopt;
1934}
1935
1936static std::optional<Instruction *>
1937instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1938  Value *Mask = II.getOperand(0);
1939 Value *BasePtr = II.getOperand(1);
1940 Value *Index = II.getOperand(2);
1941 Type *Ty = II.getType();
1942 Value *PassThru = ConstantAggregateZero::get(Ty);
1943
1944  // Replace with a zero constant when all lanes are inactive.
1945 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1946 return II_NA;
1947
1948 // Contiguous gather => masked load.
1949 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1950 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1951 Value *IndexBase;
1952 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1953 m_Value(IndexBase), m_SpecificInt(1)))) {
1954 Align Alignment =
1955 BasePtr->getPointerAlignment(II.getDataLayout());
1956
1957 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1958 BasePtr, IndexBase);
1959 CallInst *MaskedLoad =
1960 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1961 MaskedLoad->takeName(&II);
1962 return IC.replaceInstUsesWith(II, MaskedLoad);
1963 }
1964
1965 return std::nullopt;
1966}
1967
1968static std::optional<Instruction *>
1969instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1970  Value *Val = II.getOperand(0);
1971 Value *Mask = II.getOperand(1);
1972 Value *BasePtr = II.getOperand(2);
1973 Value *Index = II.getOperand(3);
1974 Type *Ty = Val->getType();
1975
1976 // Contiguous scatter => masked store.
1977 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1978 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1979 Value *IndexBase;
1980 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1981 m_Value(IndexBase), m_SpecificInt(1)))) {
1982 Align Alignment =
1983 BasePtr->getPointerAlignment(II.getDataLayout());
1984
1985 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1986 BasePtr, IndexBase);
1987 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1988
1989 return IC.eraseInstFromFunction(II);
1990 }
1991
1992 return std::nullopt;
1993}
1994
1995static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1996 IntrinsicInst &II) {
1997 Type *Int32Ty = IC.Builder.getInt32Ty();
1998 Value *Pred = II.getOperand(0);
1999 Value *Vec = II.getOperand(1);
2000 Value *DivVec = II.getOperand(2);
2001
2002 Value *SplatValue = getSplatValue(DivVec);
2003 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2004 if (!SplatConstantInt)
2005 return std::nullopt;
2006
2007 APInt Divisor = SplatConstantInt->getValue();
2008 const int64_t DivisorValue = Divisor.getSExtValue();
2009 if (DivisorValue == -1)
2010 return std::nullopt;
2011 if (DivisorValue == 1)
2012 IC.replaceInstUsesWith(II, Vec);
2013
2014 if (Divisor.isPowerOf2()) {
2015 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2016 auto ASRD = IC.Builder.CreateIntrinsic(
2017 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2018 return IC.replaceInstUsesWith(II, ASRD);
2019 }
2020 if (Divisor.isNegatedPowerOf2()) {
2021 Divisor.negate();
2022 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2023 auto ASRD = IC.Builder.CreateIntrinsic(
2024 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2025 auto NEG = IC.Builder.CreateIntrinsic(
2026 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2027 return IC.replaceInstUsesWith(II, NEG);
2028 }
2029
2030 return std::nullopt;
2031}
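// Illustrative sketch (schematic, names invented):
//   sve.sdiv(PG, %x, splat(8))   -->  sve.asrd(PG, %x, 3)
//   sve.sdiv(PG, %x, splat(-8))  -->  sve.neg of the equivalent asrd result
// Division by a splat of 1 is replaced by %x; division by -1 is left alone.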
2032
2033bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2034 size_t VecSize = Vec.size();
2035 if (VecSize == 1)
2036 return true;
2037 if (!isPowerOf2_64(VecSize))
2038 return false;
2039 size_t HalfVecSize = VecSize / 2;
2040
2041 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2042 RHS != Vec.end(); LHS++, RHS++) {
2043 if (*LHS != nullptr && *RHS != nullptr) {
2044 if (*LHS == *RHS)
2045 continue;
2046 else
2047 return false;
2048 }
2049 if (!AllowPoison)
2050 return false;
2051 if (*LHS == nullptr && *RHS != nullptr)
2052 *LHS = *RHS;
2053 }
2054
2055 Vec.resize(HalfVecSize);
2056 SimplifyValuePattern(Vec, AllowPoison);
2057 return true;
2058}
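// Illustrative example (names invented): SimplifyValuePattern({A, B, A, B},
// /*AllowPoison=*/false) shrinks the vector to {A, B} and returns true,
// whereas {A, B, A, C} returns false. With AllowPoison set, nullptr (poison)
// lanes in the first half may be filled from the matching lane in the second
// half before halving.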
2059
2060// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2061// to dupqlane(f64(C)) where C is A concatenated with B
2062static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2063 IntrinsicInst &II) {
2064 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2065 if (!match(II.getOperand(0),
2066 m_Intrinsic<Intrinsic::vector_insert>(
2067 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2068 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2069 return std::nullopt;
2070 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2071
2072 // Insert the scalars into a container ordered by InsertElement index
2073 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2074 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2075 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2076 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2077 CurrentInsertElt = InsertElt->getOperand(0);
2078 }
2079
2080 bool AllowPoison =
2081 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2082 if (!SimplifyValuePattern(Elts, AllowPoison))
2083 return std::nullopt;
2084
2085 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2086 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2087 for (size_t I = 0; I < Elts.size(); I++) {
2088 if (Elts[I] == nullptr)
2089 continue;
2090 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2091 IC.Builder.getInt64(I));
2092 }
2093 if (InsertEltChain == nullptr)
2094 return std::nullopt;
2095
2096 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2097 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2098 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2099 // be narrowed back to the original type.
2100 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2101 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2102 IIScalableTy->getMinNumElements() /
2103 PatternWidth;
2104
2105 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2106 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2107 auto *WideShuffleMaskTy =
2108 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2109
2110 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
2111 auto InsertSubvector = IC.Builder.CreateInsertVector(
2112 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
2113 auto WideBitcast =
2114 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2115 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2116 auto WideShuffle = IC.Builder.CreateShuffleVector(
2117 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2118 auto NarrowBitcast =
2119 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2120
2121 return IC.replaceInstUsesWith(II, NarrowBitcast);
2122}
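// Illustrative sketch (schematic, names invented): a dupq_lane whose quadword
// pattern is (f32 A, f32 B, f32 A, f32 B) is simplified to the pair (A, B),
// inserted into a scalable vector, bitcast to an i64 element type, splatted
// with an all-zero shufflevector mask, and bitcast back, so the quadword
// splat becomes an ordinary 64-bit element splat.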
2123
2124static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2125 IntrinsicInst &II) {
2126 Value *A = II.getArgOperand(0);
2127 Value *B = II.getArgOperand(1);
2128 if (A == B)
2129 return IC.replaceInstUsesWith(II, A);
2130
2131 return std::nullopt;
2132}
2133
2134static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2135 IntrinsicInst &II) {
2136 Value *Pred = II.getOperand(0);
2137 Value *Vec = II.getOperand(1);
2138 Value *Shift = II.getOperand(2);
2139
2140 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2141 Value *AbsPred, *MergedValue;
2142 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2143 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2144 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2145 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2146
2147 return std::nullopt;
2148
2149 // Transform is valid if any of the following are true:
2150 // * The ABS merge value is an undef or non-negative
2151 // * The ABS predicate is all active
2152 // * The ABS predicate and the SRSHL predicates are the same
2153 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2154 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2155 return std::nullopt;
2156
2157 // Only valid when the shift amount is non-negative, otherwise the rounding
2158 // behaviour of SRSHL cannot be ignored.
2159 if (!match(Shift, m_NonNegative()))
2160 return std::nullopt;
2161
2162 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2163 {II.getType()}, {Pred, Vec, Shift});
2164
2165 return IC.replaceInstUsesWith(II, LSL);
2166}
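// Illustrative sketch (schematic, names invented): when the shifted value is
// known non-negative because it comes from an ABS with a compatible
// predicate, and the shift amount is non-negative,
//   A = sve.abs(undef, PG, %v)
//   R = sve.srshl(PG, A, splat(2))
// simplifies to
//   R = sve.lsl(PG, A, splat(2))
// since rounding only matters for negative (right) shift amounts.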
2167
2168static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2169 IntrinsicInst &II) {
2170 Value *Vec = II.getOperand(0);
2171
2172 if (getSplatValue(Vec) == II.getOperand(1))
2173 return IC.replaceInstUsesWith(II, Vec);
2174
2175 return std::nullopt;
2176}
2177
2178static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2179 IntrinsicInst &II) {
2180  // If this barrier is post-dominated by an identical one, we can remove it.
2181 auto *NI = II.getNextNonDebugInstruction();
2182 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2183 auto CanSkipOver = [](Instruction *I) {
2184 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2185 };
2186 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2187 auto *NIBB = NI->getParent();
2188 NI = NI->getNextNonDebugInstruction();
2189 if (!NI) {
2190 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2191 NI = SuccBB->getFirstNonPHIOrDbgOrLifetime();
2192 else
2193 break;
2194 }
2195 }
2196 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2197 if (NextII && II.isIdenticalTo(NextII))
2198 return IC.eraseInstFromFunction(II);
2199
2200 return std::nullopt;
2201}
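// Illustrative sketch: two identical "dmb" barriers separated only by
// instructions that neither touch memory nor have side effects (within the
// lookahead budget) collapse to one, because the first barrier is redundant
// when an identical one follows it.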
2202
2203std::optional<Instruction *>
2204AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2205                                     IntrinsicInst &II) const {
2206 Intrinsic::ID IID = II.getIntrinsicID();
2207 switch (IID) {
2208 default:
2209 break;
2210 case Intrinsic::aarch64_dmb:
2211 return instCombineDMB(IC, II);
2212 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
2213 case Intrinsic::aarch64_sve_fcvt_f16f32:
2214 case Intrinsic::aarch64_sve_fcvt_f16f64:
2215 case Intrinsic::aarch64_sve_fcvt_f32f16:
2216 case Intrinsic::aarch64_sve_fcvt_f32f64:
2217 case Intrinsic::aarch64_sve_fcvt_f64f16:
2218 case Intrinsic::aarch64_sve_fcvt_f64f32:
2219 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
2220 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
2221 case Intrinsic::aarch64_sve_fcvtx_f32f64:
2222 case Intrinsic::aarch64_sve_fcvtzs:
2223 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
2224 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
2225 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
2226 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
2227 case Intrinsic::aarch64_sve_fcvtzu:
2228 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
2229 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
2230 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
2231 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
2232 case Intrinsic::aarch64_sve_scvtf:
2233 case Intrinsic::aarch64_sve_scvtf_f16i32:
2234 case Intrinsic::aarch64_sve_scvtf_f16i64:
2235 case Intrinsic::aarch64_sve_scvtf_f32i64:
2236 case Intrinsic::aarch64_sve_scvtf_f64i32:
2237 case Intrinsic::aarch64_sve_ucvtf:
2238 case Intrinsic::aarch64_sve_ucvtf_f16i32:
2239 case Intrinsic::aarch64_sve_ucvtf_f16i64:
2240 case Intrinsic::aarch64_sve_ucvtf_f32i64:
2241  case Intrinsic::aarch64_sve_ucvtf_f64i32:
2242    return instCombineSVENoActiveReplace(IC, II, true);
2243  case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
2244 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
2245 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
2246 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
2247 return instCombineSVENoActiveReplace(IC, II, true);
2248 case Intrinsic::aarch64_sve_st1_scatter:
2249 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2250 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2251 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2252 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2253 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2254 case Intrinsic::aarch64_sve_st1dq:
2255 case Intrinsic::aarch64_sve_st1q_scatter_index:
2256 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2257 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2258 case Intrinsic::aarch64_sve_st1wq:
2259 case Intrinsic::aarch64_sve_stnt1:
2260 case Intrinsic::aarch64_sve_stnt1_scatter:
2261 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2262 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2263 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2264 return instCombineSVENoActiveUnaryErase(IC, II, 1);
2265 case Intrinsic::aarch64_sve_st2:
2266 case Intrinsic::aarch64_sve_st2q:
2267 return instCombineSVENoActiveUnaryErase(IC, II, 2);
2268 case Intrinsic::aarch64_sve_st3:
2269 case Intrinsic::aarch64_sve_st3q:
2270 return instCombineSVENoActiveUnaryErase(IC, II, 3);
2271 case Intrinsic::aarch64_sve_st4:
2272 case Intrinsic::aarch64_sve_st4q:
2273 return instCombineSVENoActiveUnaryErase(IC, II, 4);
2274 case Intrinsic::aarch64_sve_addqv:
2275 case Intrinsic::aarch64_sve_and_z:
2276 case Intrinsic::aarch64_sve_bic_z:
2277 case Intrinsic::aarch64_sve_brka_z:
2278 case Intrinsic::aarch64_sve_brkb_z:
2279 case Intrinsic::aarch64_sve_brkn_z:
2280 case Intrinsic::aarch64_sve_brkpa_z:
2281 case Intrinsic::aarch64_sve_brkpb_z:
2282 case Intrinsic::aarch64_sve_cntp:
2283 case Intrinsic::aarch64_sve_compact:
2284 case Intrinsic::aarch64_sve_eor_z:
2285 case Intrinsic::aarch64_sve_eorv:
2286 case Intrinsic::aarch64_sve_eorqv:
2287 case Intrinsic::aarch64_sve_nand_z:
2288 case Intrinsic::aarch64_sve_nor_z:
2289 case Intrinsic::aarch64_sve_orn_z:
2290 case Intrinsic::aarch64_sve_orr_z:
2291 case Intrinsic::aarch64_sve_orv:
2292 case Intrinsic::aarch64_sve_orqv:
2293 case Intrinsic::aarch64_sve_pnext:
2294 case Intrinsic::aarch64_sve_rdffr_z:
2295 case Intrinsic::aarch64_sve_saddv:
2296 case Intrinsic::aarch64_sve_uaddv:
2297 case Intrinsic::aarch64_sve_umaxv:
2298 case Intrinsic::aarch64_sve_umaxqv:
2299 case Intrinsic::aarch64_sve_cmpeq:
2300 case Intrinsic::aarch64_sve_cmpeq_wide:
2301 case Intrinsic::aarch64_sve_cmpge:
2302 case Intrinsic::aarch64_sve_cmpge_wide:
2303 case Intrinsic::aarch64_sve_cmpgt:
2304 case Intrinsic::aarch64_sve_cmpgt_wide:
2305 case Intrinsic::aarch64_sve_cmphi:
2306 case Intrinsic::aarch64_sve_cmphi_wide:
2307 case Intrinsic::aarch64_sve_cmphs:
2308 case Intrinsic::aarch64_sve_cmphs_wide:
2309 case Intrinsic::aarch64_sve_cmple_wide:
2310 case Intrinsic::aarch64_sve_cmplo_wide:
2311 case Intrinsic::aarch64_sve_cmpls_wide:
2312 case Intrinsic::aarch64_sve_cmplt_wide:
2313 case Intrinsic::aarch64_sve_facge:
2314 case Intrinsic::aarch64_sve_facgt:
2315 case Intrinsic::aarch64_sve_fcmpeq:
2316 case Intrinsic::aarch64_sve_fcmpge:
2317 case Intrinsic::aarch64_sve_fcmpgt:
2318 case Intrinsic::aarch64_sve_fcmpne:
2319 case Intrinsic::aarch64_sve_fcmpuo:
2320 case Intrinsic::aarch64_sve_ld1_gather:
2321 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2322 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2323 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2324 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2325 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2326 case Intrinsic::aarch64_sve_ld1q_gather_index:
2327 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2328 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2329 case Intrinsic::aarch64_sve_ld1ro:
2330 case Intrinsic::aarch64_sve_ld1rq:
2331 case Intrinsic::aarch64_sve_ld1udq:
2332 case Intrinsic::aarch64_sve_ld1uwq:
2333 case Intrinsic::aarch64_sve_ld2_sret:
2334 case Intrinsic::aarch64_sve_ld2q_sret:
2335 case Intrinsic::aarch64_sve_ld3_sret:
2336 case Intrinsic::aarch64_sve_ld3q_sret:
2337 case Intrinsic::aarch64_sve_ld4_sret:
2338 case Intrinsic::aarch64_sve_ld4q_sret:
2339 case Intrinsic::aarch64_sve_ldff1:
2340 case Intrinsic::aarch64_sve_ldff1_gather:
2341 case Intrinsic::aarch64_sve_ldff1_gather_index:
2342 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2343 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2344 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2345 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2346 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2347 case Intrinsic::aarch64_sve_ldnf1:
2348 case Intrinsic::aarch64_sve_ldnt1:
2349 case Intrinsic::aarch64_sve_ldnt1_gather:
2350 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2351 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2352 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2353 return instCombineSVENoActiveZero(IC, II);
2354 case Intrinsic::aarch64_sve_prf:
2355 case Intrinsic::aarch64_sve_prfb_gather_index:
2356 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
2357 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
2358 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
2359 case Intrinsic::aarch64_sve_prfd_gather_index:
2360 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
2361 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
2362 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
2363 case Intrinsic::aarch64_sve_prfh_gather_index:
2364 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
2365 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
2366 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
2367 case Intrinsic::aarch64_sve_prfw_gather_index:
2368 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
2369 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
2370 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
2371 return instCombineSVENoActiveUnaryErase(IC, II, 0);
2372 case Intrinsic::aarch64_neon_fmaxnm:
2373 case Intrinsic::aarch64_neon_fminnm:
2374 return instCombineMaxMinNM(IC, II);
2375 case Intrinsic::aarch64_sve_convert_from_svbool:
2376 return instCombineConvertFromSVBool(IC, II);
2377 case Intrinsic::aarch64_sve_dup:
2378 return instCombineSVEDup(IC, II);
2379 case Intrinsic::aarch64_sve_dup_x:
2380 return instCombineSVEDupX(IC, II);
2381 case Intrinsic::aarch64_sve_cmpne:
2382 case Intrinsic::aarch64_sve_cmpne_wide:
2383 return instCombineSVECmpNE(IC, II);
2384 case Intrinsic::aarch64_sve_rdffr:
2385 return instCombineRDFFR(IC, II);
2386 case Intrinsic::aarch64_sve_lasta:
2387 case Intrinsic::aarch64_sve_lastb:
2388 return instCombineSVELast(IC, II);
2389 case Intrinsic::aarch64_sve_clasta_n:
2390 case Intrinsic::aarch64_sve_clastb_n:
2391 return instCombineSVECondLast(IC, II);
2392 case Intrinsic::aarch64_sve_cntd:
2393 return instCombineSVECntElts(IC, II, 2);
2394 case Intrinsic::aarch64_sve_cntw:
2395 return instCombineSVECntElts(IC, II, 4);
2396 case Intrinsic::aarch64_sve_cnth:
2397 return instCombineSVECntElts(IC, II, 8);
2398 case Intrinsic::aarch64_sve_cntb:
2399 return instCombineSVECntElts(IC, II, 16);
2400 case Intrinsic::aarch64_sve_ptest_any:
2401 case Intrinsic::aarch64_sve_ptest_first:
2402 case Intrinsic::aarch64_sve_ptest_last:
2403 return instCombineSVEPTest(IC, II);
2404 case Intrinsic::aarch64_sve_fabd:
2405 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2406 case Intrinsic::aarch64_sve_fadd:
2407 return instCombineSVEVectorFAdd(IC, II);
2408 case Intrinsic::aarch64_sve_fadd_u:
2409 return instCombineSVEVectorFAddU(IC, II);
2410 case Intrinsic::aarch64_sve_fdiv:
2411 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2412 case Intrinsic::aarch64_sve_fmax:
2413 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2414 case Intrinsic::aarch64_sve_fmaxnm:
2415 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2416 case Intrinsic::aarch64_sve_fmin:
2417 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2418 case Intrinsic::aarch64_sve_fminnm:
2419 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2420 case Intrinsic::aarch64_sve_fmla:
2421 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2422 case Intrinsic::aarch64_sve_fmls:
2423 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2424 case Intrinsic::aarch64_sve_fmul:
2425 if (auto II_U =
2426 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2427 return II_U;
2428 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2429 case Intrinsic::aarch64_sve_fmul_u:
2430 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2431 case Intrinsic::aarch64_sve_fmulx:
2432 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2433 case Intrinsic::aarch64_sve_fnmla:
2434 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2435 case Intrinsic::aarch64_sve_fnmls:
2436 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2437 case Intrinsic::aarch64_sve_fsub:
2438 return instCombineSVEVectorFSub(IC, II);
2439 case Intrinsic::aarch64_sve_fsub_u:
2440 return instCombineSVEVectorFSubU(IC, II);
2441 case Intrinsic::aarch64_sve_add:
2442 return instCombineSVEVectorAdd(IC, II);
2443 case Intrinsic::aarch64_sve_add_u:
2444 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2445 Intrinsic::aarch64_sve_mla_u>(
2446 IC, II, true);
2447 case Intrinsic::aarch64_sve_mla:
2448 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2449 case Intrinsic::aarch64_sve_mls:
2450 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2451 case Intrinsic::aarch64_sve_mul:
2452 if (auto II_U =
2453 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2454 return II_U;
2455 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2456 case Intrinsic::aarch64_sve_mul_u:
2457 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2458 case Intrinsic::aarch64_sve_sabd:
2459 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2460 case Intrinsic::aarch64_sve_smax:
2461 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2462 case Intrinsic::aarch64_sve_smin:
2463 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2464 case Intrinsic::aarch64_sve_smulh:
2465 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2466 case Intrinsic::aarch64_sve_sub:
2467 return instCombineSVEVectorSub(IC, II);
2468 case Intrinsic::aarch64_sve_sub_u:
2469 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2470 Intrinsic::aarch64_sve_mls_u>(
2471 IC, II, true);
2472 case Intrinsic::aarch64_sve_uabd:
2473 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2474 case Intrinsic::aarch64_sve_umax:
2475 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2476 case Intrinsic::aarch64_sve_umin:
2477 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2478 case Intrinsic::aarch64_sve_umulh:
2479 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2480 case Intrinsic::aarch64_sve_asr:
2481 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2482 case Intrinsic::aarch64_sve_lsl:
2483 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2484 case Intrinsic::aarch64_sve_lsr:
2485 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2486 case Intrinsic::aarch64_sve_and:
2487 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2488 case Intrinsic::aarch64_sve_bic:
2489 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2490 case Intrinsic::aarch64_sve_eor:
2491 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2492 case Intrinsic::aarch64_sve_orr:
2493 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2494 case Intrinsic::aarch64_sve_sqsub:
2495 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2496 case Intrinsic::aarch64_sve_uqsub:
2497 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2498 case Intrinsic::aarch64_sve_tbl:
2499 return instCombineSVETBL(IC, II);
2500 case Intrinsic::aarch64_sve_uunpkhi:
2501 case Intrinsic::aarch64_sve_uunpklo:
2502 case Intrinsic::aarch64_sve_sunpkhi:
2503 case Intrinsic::aarch64_sve_sunpklo:
2504 return instCombineSVEUnpack(IC, II);
2505 case Intrinsic::aarch64_sve_uzp1:
2506 return instCombineSVEUzp1(IC, II);
2507 case Intrinsic::aarch64_sve_zip1:
2508 case Intrinsic::aarch64_sve_zip2:
2509 return instCombineSVEZip(IC, II);
2510 case Intrinsic::aarch64_sve_ld1_gather_index:
2511 return instCombineLD1GatherIndex(IC, II);
2512 case Intrinsic::aarch64_sve_st1_scatter_index:
2513 return instCombineST1ScatterIndex(IC, II);
2514 case Intrinsic::aarch64_sve_ld1:
2515 return instCombineSVELD1(IC, II, DL);
2516 case Intrinsic::aarch64_sve_st1:
2517 return instCombineSVEST1(IC, II, DL);
2518 case Intrinsic::aarch64_sve_sdiv:
2519 return instCombineSVESDIV(IC, II);
2520 case Intrinsic::aarch64_sve_sel:
2521 return instCombineSVESel(IC, II);
2522 case Intrinsic::aarch64_sve_srshl:
2523 return instCombineSVESrshl(IC, II);
2524 case Intrinsic::aarch64_sve_dupq_lane:
2525 return instCombineSVEDupqLane(IC, II);
2526 case Intrinsic::aarch64_sve_insr:
2527 return instCombineSVEInsr(IC, II);
2528 }
2529
2530 return std::nullopt;
2531}
2532
2533std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2534    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2535 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2536 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2537 SimplifyAndSetOp) const {
2538 switch (II.getIntrinsicID()) {
2539 default:
2540 break;
2541 case Intrinsic::aarch64_neon_fcvtxn:
2542 case Intrinsic::aarch64_neon_rshrn:
2543 case Intrinsic::aarch64_neon_sqrshrn:
2544 case Intrinsic::aarch64_neon_sqrshrun:
2545 case Intrinsic::aarch64_neon_sqshrn:
2546 case Intrinsic::aarch64_neon_sqshrun:
2547 case Intrinsic::aarch64_neon_sqxtn:
2548 case Intrinsic::aarch64_neon_sqxtun:
2549 case Intrinsic::aarch64_neon_uqrshrn:
2550 case Intrinsic::aarch64_neon_uqshrn:
2551 case Intrinsic::aarch64_neon_uqxtn:
2552 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2553 break;
2554 }
2555
2556 return std::nullopt;
2557}
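// Illustrative sketch: for the NEON narrowing intrinsics listed above, result
// lane i depends only on lane i of operand 0, so the caller-provided
// SimplifyAndSetOp is handed the original demanded-element mask unchanged for
// that operand.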
2558
2559bool AArch64TTIImpl::enableScalableVectorization() const {
2560  return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2561                                  EnableScalableAutovecInStreamingMode);
2562}
2563
2564TypeSize
2565AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2566  switch (K) {
2567  case TargetTransformInfo::RGK_Scalar:
2568    return TypeSize::getFixed(64);
2569  case TargetTransformInfo::RGK_FixedWidthVector:
2570    if (ST->useSVEForFixedLengthVectors() &&
2571        (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2572      return TypeSize::getFixed(
2573          std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2574    else if (ST->isNeonAvailable())
2575      return TypeSize::getFixed(128);
2576    else
2577      return TypeSize::getFixed(0);
2578  case TargetTransformInfo::RGK_ScalableVector:
2579    if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2580                                 EnableScalableAutovecInStreamingMode))
2581      return TypeSize::getScalable(128);
2582    else
2583      return TypeSize::getScalable(0);
2584 }
2585 llvm_unreachable("Unsupported register kind");
2586}
2587
2588bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2589                                           ArrayRef<const Value *> Args,
2590                                           Type *SrcOverrideTy) {
2591 // A helper that returns a vector type from the given type. The number of
2592 // elements in type Ty determines the vector width.
2593 auto toVectorTy = [&](Type *ArgTy) {
2594 return VectorType::get(ArgTy->getScalarType(),
2595 cast<VectorType>(DstTy)->getElementCount());
2596 };
2597
2598 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2599 // i32, i64]. SVE doesn't generally have the same set of instructions to
2600 // perform an extend with the add/sub/mul. There are SMULLB style
2601 // instructions, but they operate on top/bottom, requiring some sort of lane
2602 // interleaving to be used with zext/sext.
2603 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2604 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2605 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2606 return false;
2607
2608 // Determine if the operation has a widening variant. We consider both the
2609 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2610 // instructions.
2611 //
2612 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2613 // verify that their extending operands are eliminated during code
2614 // generation.
2615 Type *SrcTy = SrcOverrideTy;
2616 switch (Opcode) {
2617 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2618 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2619 // The second operand needs to be an extend
2620 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2621 if (!SrcTy)
2622 SrcTy =
2623 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2624 } else
2625 return false;
2626 break;
2627 case Instruction::Mul: { // SMULL(2), UMULL(2)
2628 // Both operands need to be extends of the same type.
2629 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2630 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2631 if (!SrcTy)
2632 SrcTy =
2633 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2634 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2635      // If one of the operands is a Zext and the other has enough zero bits to
2636      // be treated as unsigned, we can still generate a umull, meaning the zext
2637      // is free.
2638 KnownBits Known =
2639 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2640 if (Args[0]->getType()->getScalarSizeInBits() -
2641 Known.Zero.countLeadingOnes() >
2642 DstTy->getScalarSizeInBits() / 2)
2643 return false;
2644 if (!SrcTy)
2645 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2646 DstTy->getScalarSizeInBits() / 2));
2647 } else
2648 return false;
2649 break;
2650 }
2651 default:
2652 return false;
2653 }
2654
2655 // Legalize the destination type and ensure it can be used in a widening
2656 // operation.
2657 auto DstTyL = getTypeLegalizationCost(DstTy);
2658 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2659 return false;
2660
2661 // Legalize the source type and ensure it can be used in a widening
2662 // operation.
2663 assert(SrcTy && "Expected some SrcTy");
2664 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2665 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2666 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2667 return false;
2668
2669 // Get the total number of vector elements in the legalized types.
2670 InstructionCost NumDstEls =
2671 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2672 InstructionCost NumSrcEls =
2673 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2674
2675 // Return true if the legalized types have the same number of vector elements
2676 // and the destination element type size is twice that of the source type.
2677 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2678}
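// Illustrative sketch (types invented): for DstTy = <8 x i16>, an
//   add %x, (zext <8 x i8> %b to <8 x i16>)
// maps onto UADDW/UADDL, so the feeding extend becomes a candidate for a free
// cast in getCastInstrCost below; the legalized source and destination must
// have the same element count with the destination elements twice as wide.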
2679
2680// s/urhadd instructions implement the following pattern, making the
2681// extends free:
2682// %x = add ((zext i8 -> i16), 1)
2683// %y = (zext i8 -> i16)
2684// trunc i16 (lshr (add %x, %y), 1) -> i8
2685//
2686static bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2687                               Type *Src) {
2688 // The source should be a legal vector type.
2689 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2690 (Src->isScalableTy() && !ST->hasSVE2()))
2691 return false;
2692
2693 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2694 return false;
2695
2696 // Look for trunc/shl/add before trying to match the pattern.
2697 const Instruction *Add = ExtUser;
2698 auto *AddUser =
2699 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2700 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2701 Add = AddUser;
2702
2703 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2704 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2705 return false;
2706
2707 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2708 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2709 Src->getScalarSizeInBits() !=
2710 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2711 return false;
2712
2713 // Try to match the whole pattern. Ext could be either the first or second
2714 // m_ZExtOrSExt matched.
2715 Instruction *Ex1, *Ex2;
2716 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2717 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2718 return false;
2719
2720 // Ensure both extends are of the same type
2721 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2722 Ex1->getOpcode() == Ex2->getOpcode())
2723 return true;
2724
2725 return false;
2726}
2727
2728InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2729                                                 Type *Src,
2730                                                 TTI::CastContextHint CCH,
2731                                                 TTI::TargetCostKind CostKind,
2732                                                 const Instruction *I) {
2733 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2734 assert(ISD && "Invalid opcode");
2735 // If the cast is observable, and it is used by a widening instruction (e.g.,
2736 // uaddl, saddw, etc.), it may be free.
2737 if (I && I->hasOneUser()) {
2738 auto *SingleUser = cast<Instruction>(*I->user_begin());
2739 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2740 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2741      // For adds, only count the second operand as free if both operands are
2742      // extends but not the same operation (i.e., both operands are not free in
2743      // add(sext, zext)).
2744 if (SingleUser->getOpcode() == Instruction::Add) {
2745 if (I == SingleUser->getOperand(1) ||
2746 (isa<CastInst>(SingleUser->getOperand(1)) &&
2747 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2748 return 0;
2749 } else // Others are free so long as isWideningInstruction returned true.
2750 return 0;
2751 }
2752
2753 // The cast will be free for the s/urhadd instructions
2754 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2755 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2756 return 0;
2757 }
2758
2759 // TODO: Allow non-throughput costs that aren't binary.
2760  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2761    if (CostKind != TTI::TCK_RecipThroughput)
2762      return Cost == 0 ? 0 : 1;
2763 return Cost;
2764 };
2765
2766 EVT SrcTy = TLI->getValueType(DL, Src);
2767 EVT DstTy = TLI->getValueType(DL, Dst);
2768
2769 if (!SrcTy.isSimple() || !DstTy.isSimple())
2770 return AdjustCost(
2771 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2772
2773 static const TypeConversionCostTblEntry BF16Tbl[] = {
2774 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
2775 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
2776 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
2777 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
2778 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
2779 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
2780 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
2781 };
2782
2783 if (ST->hasBF16())
2784 if (const auto *Entry = ConvertCostTableLookup(
2785 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2786 return AdjustCost(Entry->Cost);
2787
2788 static const TypeConversionCostTblEntry ConversionTbl[] = {
2789 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2790 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2791 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2792 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2793 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2794 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2795 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2796 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2797 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2798 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2799 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2800 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2801 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2802 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2803 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2804 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2805 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2806 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2807 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2808 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2809
2810 // Truncations on nxvmiN
2811 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
2812 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
2813 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
2814 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
2815 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
2816 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
2817 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
2818 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
2819 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
2820 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
2821 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
2822 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
2823 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
2824 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
2825 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
2826 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
2827 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
2828 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
2829 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
2830 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
2831 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
2832 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
2833 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
2834 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
2835 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
2836 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
2837 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
2838 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
2839 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
2840 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
2841 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
2842 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
2843 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
2844
2845 // The number of shll instructions for the extension.
2846 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2847 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2848 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2849 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2850 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2851 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2852 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2853 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2854 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2855 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2856 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2857 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2858 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2859 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2860 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2861 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2862
2863 // FP Ext and trunc
2864 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
2865 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
2866 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
2867 // FP16
2868 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
2869 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
2870 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
2871 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
2872 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
2873 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
2874 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
2875 // BF16 (uses shift)
2876 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
2877 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
2878 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
2879 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
2880 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
2881 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
2882 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
2883 // FP Ext and trunc
2884 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
2885 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
2886 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
2887 // FP16
2888 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
2889 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
2890 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
2891 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
2892 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
2893 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
2894 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
2895      // BF16 (more complex; the +bf16 case is handled above)
2896 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
2897 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
2898 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
2899 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
2900 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
2901 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
2902 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
2903 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
2904
2905 // LowerVectorINT_TO_FP:
2906 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2907 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2908 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2909 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2910 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2911 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2912
2913 // Complex: to v2f32
2914 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2915 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2916 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2917 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2918 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2919 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2920
2921 // Complex: to v4f32
2922 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
2923 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2924 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
2925 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2926
2927 // Complex: to v8f32
2928 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2929 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2930 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2931 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2932
2933 // Complex: to v16f32
2934 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2935 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2936
2937 // Complex: to v2f64
2938 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2939 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2940 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2941 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2942 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2943 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2944
2945 // Complex: to v4f64
2946 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2947 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2948
2949 // LowerVectorFP_TO_INT
2950 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
2951 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
2952 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
2953 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
2954 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
2955 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
2956
2957 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2958 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
2959 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
2960 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
2961 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
2962 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
2963 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
2964
2965 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2966 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
2967 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
2968 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
2969 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
2970
2971 // Complex, from nxv2f32.
2972 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2973 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2974 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2975 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2976 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2977 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2978 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2979 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2980
2981 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2982 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
2983 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
2984 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
2985 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
2986 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
2987 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
2988
2989 // Complex, from nxv2f64.
2990 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2991 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2992 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2993 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2994 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2995 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2996 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2997 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2998
2999 // Complex, from nxv4f32.
3000 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3001 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3002 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3003 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3004 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3005 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3006 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3007 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3008
3009 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3010 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3011 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3012 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3013 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3014
3015 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3016 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3017 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3018 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3019 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3020 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3021 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3022
3023 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3024 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3025 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3026 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3027 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3028
3029 // Complex, from nxv8f16.
3030 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3031 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3032 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3033 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3034 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3035 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3036 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3037 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3038
3039 // Complex, from nxv4f16.
3040 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3041 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3042 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3043 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3044 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3045 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3046 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3047 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3048
3049 // Complex, from nxv2f16.
3050 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3051 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3052 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3053 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3054 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3055 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3056 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3057 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3058
3059 // Truncate from nxvmf32 to nxvmf16.
3060 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3061 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3062 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3063
3064 // Truncate from nxvmf64 to nxvmf16.
3065 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3066 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3067 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3068
3069 // Truncate from nxvmf64 to nxvmf32.
3070 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3071 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3072 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3073
3074 // Extend from nxvmf16 to nxvmf32.
3075 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3076 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3077 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3078
3079 // Extend from nxvmf16 to nxvmf64.
3080 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3081 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3082 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3083
3084 // Extend from nxvmf32 to nxvmf64.
3085 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3086 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3087 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3088
3089 // Bitcasts from float to integer
3090 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3091 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3092 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3093
3094 // Bitcasts from integer to float
3095 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3096 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3097 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3098
3099 // Add cost for extending to illegal (too wide) scalable vectors.
3100 // Zero/sign extends are implemented by multiple unpack operations,
3101 // where each operation has a cost of 1.
3102 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3103 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3104 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3105 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3106 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3107 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3108
3109 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3110 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3111 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3112 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3113 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3114 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3115 };
3116
3117 // We have to estimate the cost of a fixed-length operation performed on
3118 // SVE registers, scaled by the number of SVE registers required to
3119 // represent the fixed-length type.
3120 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3121 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3122 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3123 ST->useSVEForFixedLengthVectors(WiderTy)) {
3124 std::pair<InstructionCost, MVT> LT =
3125 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3126 unsigned NumElements =
3127 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3128 return AdjustCost(
3129 LT.first *
3130 getCastInstrCost(
3131 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3132 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3133 CostKind, I));
3134 }
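// Illustrative note on the block above: with AArch64::SVEBitsPerBlock = 128
// and 64-bit elements, NumElements = 128 / 64 = 2, so the fixed-length
// conversion is costed as the equivalent <vscale x 2 x ...> conversion,
// scaled by LT.first (the number of SVE registers needed after legalization).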
3135
3136 if (const auto *Entry = ConvertCostTableLookup(
3137 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3138 return AdjustCost(Entry->Cost);
3139
3140 static const TypeConversionCostTblEntry FP16Tbl[] = {
3141 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3142 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3143 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3144 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3145 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3146 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3147 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3148 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3149 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3150 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3151 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3152 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3153 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3154 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3155 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3156 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3157 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3158 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3159 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3160 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3161 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3162 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3163 };
3164
3165 if (ST->hasFullFP16())
3166 if (const auto *Entry = ConvertCostTableLookup(
3167 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3168 return AdjustCost(Entry->Cost);
3169
3170 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3173 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3175 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3177 // The standard behaviour in the backend for these cases is to split the
3178 // extend up into two parts:
3179 // 1. Perform an extending load or masked load up to the legal type.
3180 // 2. Extend the loaded data to the final type.
3181 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3182 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3183 InstructionCost Part1 = getCastInstrCost(
3184 Opcode, LegalTy, Src, CCH, CostKind, I);
3185 InstructionCost Part2 = getCastInstrCost(
3186 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3187 return Part1 + Part2;
3188 }
3189
3190 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3191 // but we also want to include the TTI::CastContextHint::Masked case too.
3192 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3194 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3196
3197 return AdjustCost(
3198 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3199}
3200
3202 Type *Dst,
3203 VectorType *VecTy,
3204 unsigned Index) {
3205
3206 // Make sure we were given a valid extend opcode.
3207 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3208 "Invalid opcode");
3209
3210 // We are extending an element we extract from a vector, so the source type
3211 // of the extend is the element type of the vector.
3212 auto *Src = VecTy->getElementType();
3213
3214 // Sign- and zero-extends are for integer types only.
3215 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3216
3217 // Get the cost for the extract. We compute the cost (if any) for the extend
3218 // below.
3220 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3221 CostKind, Index, nullptr, nullptr);
3222
3223 // Legalize the types.
3224 auto VecLT = getTypeLegalizationCost(VecTy);
3225 auto DstVT = TLI->getValueType(DL, Dst);
3226 auto SrcVT = TLI->getValueType(DL, Src);
3227
3228 // If the resulting type is still a vector and the destination type is legal,
3229 // we may get the extension for free. If not, get the default cost for the
3230 // extend.
3231 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3232 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3233 CostKind);
3234
3235 // The destination type should be larger than the element type. If not, get
3236 // the default cost for the extend.
3237 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3238 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3239 CostKind);
3240
3241 switch (Opcode) {
3242 default:
3243 llvm_unreachable("Opcode should be either SExt or ZExt");
3244
3245 // For sign-extends, we only need a smov, which performs the extension
3246 // automatically.
3247 case Instruction::SExt:
3248 return Cost;
3249
3250 // For zero-extends, the extend is performed automatically by a umov unless
3251 // the destination type is i64 and the element type is i8 or i16.
3252 case Instruction::ZExt:
3253 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3254 return Cost;
3255 }
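// For example: zero-extending an extracted i8 or i16 lane to i32 is covered
// by the umov itself, so only the extract cost is returned above, whereas
// zero-extending the same lane to i64 is not free and falls through to the
// default cast cost below.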
3256
3257 // If we are unable to perform the extend for free, get the default cost.
3258 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3259 CostKind);
3260}
3261
3264 const Instruction *I) {
3265 if (CostKind != TTI::TCK_RecipThroughput)
3266 return Opcode == Instruction::PHI ? 0 : 1;
3267 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3268 // Branches are assumed to be predicted.
3269 return 0;
3270}
3271
3272InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3273 unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3274 const Instruction *I, Value *Scalar,
3275 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3276 assert(Val->isVectorTy() && "This must be a vector type");
3277
3278 if (Index != -1U) {
3279 // Legalize the type.
3280 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3281
3282 // This type is legalized to a scalar type.
3283 if (!LT.second.isVector())
3284 return 0;
3285
3286 // The type may be split. For fixed-width vectors we can normalize the
3287 // index to the new type.
3288 if (LT.second.isFixedLengthVector()) {
3289 unsigned Width = LT.second.getVectorNumElements();
3290 Index = Index % Width;
3291 }
3292
3293 // The element at index zero is already inside the vector.
3294 // - For a physical (HasRealUse==true) insert-element or extract-element
3295 // instruction that extracts integers, an explicit FPR -> GPR move is
3296 // needed. So it has non-zero cost.
3297 // - For the rest of cases (virtual instruction or element type is float),
3298 // consider the instruction free.
3299 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3300 return 0;
3301
3302 // This recognises an LD1 (load single structure to one lane of one
3303 // register) instruction. I.e., if this is an `insertelement` instruction,
3304 // and its second operand is a load, then we will generate an LD1, which
3305 // is an expensive instruction.
3306 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3307 return ST->getVectorInsertExtractBaseCost() + 1;
3308
3309 // i1 inserts and extracts will include an extra cset or cmp of the vector
3310 // value. Increase the cost by 1 to account for this.
3311 if (Val->getScalarSizeInBits() == 1)
3312 return ST->getVectorInsertExtractBaseCost() + 1;
3313
3314 // FIXME:
3315 // If the extract-element and insert-element instructions could be
3316 // simplified away (e.g., could be combined into users by looking at use-def
3317 // context), they have no cost. This is not done in the first place for
3318 // compile-time considerations.
3319 }
3320
3321 // In the case of NEON, if there exists an extractelement from lane != 0 such that
3322 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3323 // 2. extractelement result feeds into fmul.
3324 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3325 // equivalent to 0.
3326 // then the extractelement can be merged with fmul in the backend and it
3327 // incurs no cost.
3328 // e.g.
3329 // define double @foo(<2 x double> %a) {
3330 // %1 = extractelement <2 x double> %a, i32 0
3331 // %2 = extractelement <2 x double> %a, i32 1
3332 // %res = fmul double %1, %2
3333 // ret double %res
3334 // }
3335 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3336 auto ExtractCanFuseWithFmul = [&]() {
3337 // We bail out if the extract is from lane 0.
3338 if (Index == 0)
3339 return false;
3340
3341 // Check if the scalar element type of the vector operand of ExtractElement
3342 // instruction is one of the allowed types.
3343 auto IsAllowedScalarTy = [&](const Type *T) {
3344 return T->isFloatTy() || T->isDoubleTy() ||
3345 (T->isHalfTy() && ST->hasFullFP16());
3346 };
3347
3348 // Check if the extractelement user is scalar fmul.
3349 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3350 // Check if the user is scalar fmul.
3351 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3352 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3353 !BO->getType()->isVectorTy();
3354 };
3355
3356 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3357 // certain scalar type and a certain vector register width.
3358 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3359 auto RegWidth =
3361 .getFixedValue();
3362 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3363 };
3364
3365 // Check if the type constraints on input vector type and result scalar type
3366 // of extractelement instruction are satisfied.
3367 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3368 return false;
3369
3370 if (Scalar) {
3371 DenseMap<User *, unsigned> UserToExtractIdx;
3372 for (auto *U : Scalar->users()) {
3373 if (!IsUserFMulScalarTy(U))
3374 return false;
3375 // Recording an entry for the user is what matters here; the index value
3376 // is not important.
3377 UserToExtractIdx[U];
3378 }
3379 if (UserToExtractIdx.empty())
3380 return false;
3381 for (auto &[S, U, L] : ScalarUserAndIdx) {
3382 for (auto *U : S->users()) {
3383 if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3384 auto *FMul = cast<BinaryOperator>(U);
3385 auto *Op0 = FMul->getOperand(0);
3386 auto *Op1 = FMul->getOperand(1);
3387 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3388 UserToExtractIdx[U] = L;
3389 break;
3390 }
3391 }
3392 }
3393 }
3394 for (auto &[U, L] : UserToExtractIdx) {
3395 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3396 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3397 return false;
3398 }
3399 } else {
3400 const auto *EE = cast<ExtractElementInst>(I);
3401
3402 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3403 if (!IdxOp)
3404 return false;
3405
3406 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3407 if (!IsUserFMulScalarTy(U))
3408 return false;
3409
3410 // Check if the other operand of extractelement is also extractelement
3411 // from lane equivalent to 0.
3412 const auto *BO = cast<BinaryOperator>(U);
3413 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3414 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3415 if (OtherEE) {
3416 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3417 if (!IdxOp)
3418 return false;
3419 return IsExtractLaneEquivalentToZero(
3420 cast<ConstantInt>(OtherEE->getIndexOperand())
3421 ->getValue()
3422 .getZExtValue(),
3423 OtherEE->getType()->getScalarSizeInBits());
3424 }
3425 return true;
3426 });
3427 }
3428 return true;
3429 };
3430
3431 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3432 ExtractCanFuseWithFmul())
3433 return 0;
3434
3435 // All other inserts/extracts cost this much.
3436 return ST->getVectorInsertExtractBaseCost();
3437}
3438
3441 unsigned Index, Value *Op0,
3442 Value *Op1) {
3443 bool HasRealUse =
3444 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3445 return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3446}
3447
3449 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3450 Value *Scalar,
3451 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3452 return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3453 ScalarUserAndIdx);
3454}
3455
3457 Type *Val,
3459 unsigned Index) {
3460 return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3461 true /* HasRealUse */, &I);
3462}
3463
3465 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3467 if (isa<ScalableVectorType>(Ty))
3469 if (Ty->getElementType()->isFloatingPointTy())
3470 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3471 CostKind);
3472 return DemandedElts.popcount() * (Insert + Extract) *
3473 ST->getVectorInsertExtractBaseCost();
3474}
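// Worked example for the scalarization overhead above (assuming a vector
// insert/extract base cost of 2): scalarizing all four lanes of a v4i32 for
// both insert and extract costs 4 * (1 + 1) * 2 = 16.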
3475
3477 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3480 const Instruction *CxtI) {
3481
3482 // The code-generator is currently not able to handle scalable vectors
3483 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3484 // it. This change will be removed when code-generation for these types is
3485 // sufficiently reliable.
3486 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3487 if (VTy->getElementCount() == ElementCount::getScalable(1))
3489
3490 // TODO: Handle more cost kinds.
3492 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3493 Op2Info, Args, CxtI);
3494
3495 // Legalize the type.
3496 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3497 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3498
3499 switch (ISD) {
3500 default:
3501 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3502 Op2Info);
3503 case ISD::SDIV:
3504 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3505 // On AArch64, scalar signed division by a power-of-two constant is
3506 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3507 // The OperandValue properties may not be the same as those of the
3508 // previous operation; conservatively assume OP_None.
3509 InstructionCost Cost = getArithmeticInstrCost(
3510 Instruction::Add, Ty, CostKind,
3511 Op1Info.getNoProps(), Op2Info.getNoProps());
3512 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3513 Op1Info.getNoProps(), Op2Info.getNoProps());
3514 Cost += getArithmeticInstrCost(
3515 Instruction::Select, Ty, CostKind,
3516 Op1Info.getNoProps(), Op2Info.getNoProps());
3517 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3518 Op1Info.getNoProps(), Op2Info.getNoProps());
3519 return Cost;
3520 }
3521 [[fallthrough]];
3522 case ISD::UDIV: {
3523 auto VT = TLI->getValueType(DL, Ty);
3524 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3525 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3526 // A vector signed division by a constant is expanded to the
3527 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and an unsigned division
3528 // to MULHU + SUB + SRL + ADD + SRL.
3529 InstructionCost MulCost = getArithmeticInstrCost(
3530 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3531 InstructionCost AddCost = getArithmeticInstrCost(
3532 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3533 InstructionCost ShrCost = getArithmeticInstrCost(
3534 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3535 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3536 }
3537 }
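// Illustrative: assuming unit costs for the mul/add/shift steps, the
// constant-division expansion above is modelled as 2 + 2 + 2 + 1 = 7.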
3538
3539 // i128 divisions are lowered as libcalls. Pass nullptr since the
3540 // __divti3/__udivti3 calls are emitted by the backend even when those
3541 // functions are not declared in the module.
3542 if (!VT.isVector() && VT.getSizeInBits() > 64)
3543 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3544
3545 InstructionCost Cost = BaseT::getArithmeticInstrCost(
3546 Opcode, Ty, CostKind, Op1Info, Op2Info);
3547 if (Ty->isVectorTy()) {
3548 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3549 // If SDIV/UDIV operations are lowered using SVE, then the cost can be
3550 // lower.
3551 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3552 ->getPrimitiveSizeInBits()
3553 .getFixedValue() < 128) {
3554 EVT VT = TLI->getValueType(DL, Ty);
3555 static const CostTblEntry DivTbl[]{
3556 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3557 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3558 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3559 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3560 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3561 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3562
3563 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3564 if (nullptr != Entry)
3565 return Entry->Cost;
3566 }
3567 // For 8/16-bit elements, the cost is higher because the type
3568 // requires promotion and possibly splitting:
3569 if (LT.second.getScalarType() == MVT::i8)
3570 Cost *= 8;
3571 else if (LT.second.getScalarType() == MVT::i16)
3572 Cost *= 4;
3573 return Cost;
3574 } else {
3575 // If one of the operands is a uniform constant then the cost for each
3576 // element is the cost of insertion, extraction and the division itself.
3577 // Insertion cost = 2, extraction cost = 2, division = cost of the
3578 // operation with the scalar type.
3579 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3580 (Op2Info.isConstant() && Op2Info.isUniform())) {
3581 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3582 InstructionCost DivCost = getArithmeticInstrCost(
3583 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3584 return (4 + DivCost) * VTy->getNumElements();
3585 }
3586 }
3587 // On AArch64, without SVE, vector divisions are expanded
3588 // into scalar divisions of each pair of elements.
3589 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3590 CostKind, Op1Info, Op2Info);
3591 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3592 Op1Info, Op2Info);
3593 }
3594
3595 // TODO: if one of the arguments is scalar, then it's not necessary to
3596 // double the cost of handling the vector elements.
3597 Cost += Cost;
3598 }
3599 return Cost;
3600 }
3601 case ISD::MUL:
3602 // When SVE is available, then we can lower the v2i64 operation using
3603 // the SVE mul instruction, which has a lower cost.
3604 if (LT.second == MVT::v2i64 && ST->hasSVE())
3605 return LT.first;
3606
3607 // When SVE is not available, there is no MUL.2d instruction,
3608 // which means mul <2 x i64> is expensive as elements are extracted
3609 // from the vectors and the muls scalarized.
3610 // As getScalarizationOverhead is a bit too pessimistic, we
3611 // estimate the cost for a i64 vector directly here, which is:
3612 // - four 2-cost i64 extracts,
3613 // - two 2-cost i64 inserts, and
3614 // - two 1-cost muls.
3615 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3616 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3617 // need to scalarize so the cost can be cheaper (smull or umull).
3619 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3620 return LT.first;
3621 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
3622 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
3623 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
3624 nullptr, nullptr) *
3625 2 +
3626 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
3627 nullptr, nullptr));
3628 case ISD::ADD:
3629 case ISD::XOR:
3630 case ISD::OR:
3631 case ISD::AND:
3632 case ISD::SRL:
3633 case ISD::SRA:
3634 case ISD::SHL:
3635 // These nodes are marked as 'custom' for combining purposes only.
3636 // We know that they are legal. See LowerAdd in ISelLowering.
3637 return LT.first;
3638
3639 case ISD::FNEG:
3640 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3641 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3642 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3643 CxtI &&
3644 ((CxtI->hasOneUse() &&
3645 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3646 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3647 return 0;
3648 [[fallthrough]];
3649 case ISD::FADD:
3650 case ISD::FSUB:
3651 // Increase the cost for half and bfloat types if not architecturally
3652 // supported.
3653 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3654 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3655 return 2 * LT.first;
3656 if (!Ty->getScalarType()->isFP128Ty())
3657 return LT.first;
3658 [[fallthrough]];
3659 case ISD::FMUL:
3660 case ISD::FDIV:
3661 // These nodes are marked as 'custom' just to lower them to SVE.
3662 // We know said lowering will incur no additional cost.
3663 if (!Ty->getScalarType()->isFP128Ty())
3664 return 2 * LT.first;
3665
3666 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3667 Op2Info);
3668 case ISD::FREM:
3669 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3670 // those functions are not declared in the module.
3671 if (!Ty->isVectorTy())
3672 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3673 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3674 Op2Info);
3675 }
3676}
3677
3679 ScalarEvolution *SE,
3680 const SCEV *Ptr) {
3681 // Address computations in vectorized code with non-consecutive addresses will
3682 // likely result in more instructions compared to scalar code where the
3683 // computation can more often be merged into the index mode. The resulting
3684 // extra micro-ops can significantly decrease throughput.
3685 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3686 int MaxMergeDistance = 64;
3687
3688 if (Ty->isVectorTy() && SE &&
3689 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3690 return NumVectorInstToHideOverhead;
3691
3692 // In many cases the address computation is not merged into the instruction
3693 // addressing mode.
3694 return 1;
3695}
3696
3698 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3700 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3701 // TODO: Handle other cost kinds.
3703 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3704 Op1Info, Op2Info, I);
3705
3706 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3707 // We don't lower some vector selects that are wider than the register width
3708 // particularly well.
3709 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3710 // We would need this many instructions to hide the cost of the scalarization.
3711 const int AmortizationCost = 20;
3712
3713 // If VecPred is not set, check if we can get a predicate from the context
3714 // instruction, if its type matches the requested ValTy.
3715 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3716 CmpPredicate CurrentPred;
3717 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3718 m_Value())))
3719 VecPred = CurrentPred;
3720 }
3721 // Check if we have a compare/select chain that can be lowered using
3722 // a (F)CMxx & BFI pair.
3723 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3724 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3725 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3726 VecPred == CmpInst::FCMP_UNE) {
3727 static const auto ValidMinMaxTys = {
3728 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3729 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3730 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3731
3732 auto LT = getTypeLegalizationCost(ValTy);
3733 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3734 (ST->hasFullFP16() &&
3735 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3736 return LT.first;
3737 }
3738
3739 static const TypeConversionCostTblEntry
3740 VectorSelectTbl[] = {
3741 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3742 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3743 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3744 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3745 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3746 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3747 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3748 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3749 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3750 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3751 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3752 };
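// Illustrative: a select with a v4i1 condition and v4i64 result is assumed to
// be scalarized, so its table cost is 4 * AmortizationCost = 80, which
// strongly discourages vectorization unless plenty of other work is
// vectorized alongside it.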
3753
3754 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3755 EVT SelValTy = TLI->getValueType(DL, ValTy);
3756 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3757 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3758 SelCondTy.getSimpleVT(),
3759 SelValTy.getSimpleVT()))
3760 return Entry->Cost;
3761 }
3762 }
3763
3764 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3765 auto LT = getTypeLegalizationCost(ValTy);
3766 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3767 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3768 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3769 }
3770
3771 // Treat the icmp in icmp(and, 0) as free, as we can use a flag-setting ANDS.
3772 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3773 // be profitable.
3774 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3775 ICmpInst::isEquality(VecPred) &&
3776 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3777 match(I->getOperand(1), m_Zero()) &&
3778 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3779 return 0;
3780
3781 // The base case handles scalable vectors fine for now, since it treats the
3782 // cost as 1 * legalization cost.
3783 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3784 Op1Info, Op2Info, I);
3785}
3786
3788AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3790 if (ST->requiresStrictAlign()) {
3791 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3792 // a bunch of instructions when strict align is enabled.
3793 return Options;
3794 }
3795 Options.AllowOverlappingLoads = true;
3796 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3797 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3798 // TODO: Though vector loads usually perform well on AArch64, on some targets
3799 // they may wake up the FP unit, which raises the power consumption. Perhaps
3800 // they could be used with no holds barred (-O3).
3801 Options.LoadSizes = {8, 4, 2, 1};
3802 Options.AllowedTailExpansions = {3, 5, 6};
3803 return Options;
3804}
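// Illustrative note on the options above: with LoadSizes = {8, 4, 2, 1}, a
// 13-byte memcmp can be expanded into an 8-byte comparison followed by
// smaller (possibly overlapping) loads rather than a library call, subject
// to the MaxNumLoads limit.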
3805
3807 return ST->hasSVE();
3808}
3809
3812 Align Alignment, unsigned AddressSpace,
3814 if (useNeonVector(Src))
3815 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3816 CostKind);
3817 auto LT = getTypeLegalizationCost(Src);
3818 if (!LT.first.isValid())
3820
3821 // Return an invalid cost for element types that we are unable to lower.
3822 auto *VT = cast<VectorType>(Src);
3823 if (VT->getElementType()->isIntegerTy(1))
3825
3826 // The code-generator is currently not able to handle scalable vectors
3827 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3828 // it. This change will be removed when code-generation for these types is
3829 // sufficiently reliable.
3830 if (VT->getElementCount() == ElementCount::getScalable(1))
3832
3833 return LT.first;
3834}
3835
3836// This function returns the gather/scatter overhead, either from the
3837// user-provided value or from per-target specialized values in \p ST.
3838static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3839 const AArch64Subtarget *ST) {
3840 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3841 "Should be called on only load or stores.");
3842 switch (Opcode) {
3843 case Instruction::Load:
3844 if (SVEGatherOverhead.getNumOccurrences() > 0)
3845 return SVEGatherOverhead;
3846 return ST->getGatherOverhead();
3847 break;
3848 case Instruction::Store:
3849 if (SVEScatterOverhead.getNumOccurrences() > 0)
3850 return SVEScatterOverhead;
3851 return ST->getScatterOverhead();
3852 break;
3853 default:
3854 llvm_unreachable("Shouldn't have reached here");
3855 }
3856}
3857
3859 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3860 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3861 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3862 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3863 Alignment, CostKind, I);
3864 auto *VT = cast<VectorType>(DataTy);
3865 auto LT = getTypeLegalizationCost(DataTy);
3866 if (!LT.first.isValid())
3868
3869 // Return an invalid cost for element types that we are unable to lower.
3870 if (!LT.second.isVector() ||
3871 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3872 VT->getElementType()->isIntegerTy(1))
3874
3875 // The code-generator is currently not able to handle scalable vectors
3876 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3877 // it. This change will be removed when code-generation for these types is
3878 // sufficiently reliable.
3879 if (VT->getElementCount() == ElementCount::getScalable(1))
3881
3882 ElementCount LegalVF = LT.second.getVectorElementCount();
3883 InstructionCost MemOpCost =
3884 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3885 {TTI::OK_AnyValue, TTI::OP_None}, I);
3886 // Add on an overhead cost for using gathers/scatters.
3887 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3888 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3889}
3890
3892 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3893}
3894
3896 MaybeAlign Alignment,
3897 unsigned AddressSpace,
3899 TTI::OperandValueInfo OpInfo,
3900 const Instruction *I) {
3901 EVT VT = TLI->getValueType(DL, Ty, true);
3902 // Type legalization can't handle structs
3903 if (VT == MVT::Other)
3904 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3905 CostKind);
3906
3907 auto LT = getTypeLegalizationCost(Ty);
3908 if (!LT.first.isValid())
3910
3911 // The code-generator is currently not able to handle scalable vectors
3912 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3913 // it. This change will be removed when code-generation for these types is
3914 // sufficiently reliable.
3915 // We also only support full register predicate loads and stores.
3916 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3917 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3918 (VTy->getElementType()->isIntegerTy(1) &&
3919 !VTy->getElementCount().isKnownMultipleOf(
3922
3923 // TODO: consider latency as well for TCK_SizeAndLatency.
3925 return LT.first;
3926
3928 return 1;
3929
3930 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3931 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3932 // Unaligned stores are extremely inefficient. We don't split all
3933 // unaligned 128-bit stores because of the negative impact that splitting
3934 // has shown in practice on inlined block copy code.
3935 // We make such stores expensive so that we will only vectorize if there
3936 // are 6 other instructions getting vectorized.
3937 const int AmortizationCost = 6;
3938
3939 return LT.first * 2 * AmortizationCost;
3940 }
3941
3942 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3943 if (Ty->isPtrOrPtrVectorTy())
3944 return LT.first;
3945
3946 if (useNeonVector(Ty)) {
3947 // Check truncating stores and extending loads.
3948 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3949 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3950 if (VT == MVT::v4i8)
3951 return 2;
3952 // Otherwise we need to scalarize.
3953 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3954 }
3955 EVT EltVT = VT.getVectorElementType();
3956 unsigned EltSize = EltVT.getScalarSizeInBits();
3957 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3958 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3959 *Alignment != Align(1))
3960 return LT.first;
3961 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3962 // widening to v4i8, which produces suboptimal results.
3963 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3964 return LT.first;
3965
3966 // Check non-power-of-2 loads/stores for legal vector element types with
3967 // NEON. Non-power-of-2 memory ops will get broken down into a set of
3968 // smaller power-of-2 operations, including ld1/st1.
3969 LLVMContext &C = Ty->getContext();
3971 SmallVector<EVT> TypeWorklist;
3972 TypeWorklist.push_back(VT);
3973 while (!TypeWorklist.empty()) {
3974 EVT CurrVT = TypeWorklist.pop_back_val();
3975 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3976 if (isPowerOf2_32(CurrNumElements)) {
3977 Cost += 1;
3978 continue;
3979 }
3980
3981 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3982 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3983 TypeWorklist.push_back(
3984 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3985 }
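// Worked example: a v7i16 memory op is decomposed as 7 -> 4 + 3, then
// 3 -> 2 + 1, giving three power-of-2 pieces (v4i16, v2i16, v1i16) and a
// cost of 3.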
3986 return Cost;
3987 }
3988
3989 return LT.first;
3990}
3991
3993 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3994 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3995 bool UseMaskForCond, bool UseMaskForGaps) {
3996 assert(Factor >= 2 && "Invalid interleave factor");
3997 auto *VecVTy = cast<VectorType>(VecTy);
3998
3999 if (VecTy->isScalableTy() && !ST->hasSVE())
4001
4002 // Vectorization for masked interleaved accesses is only enabled for scalable
4003 // VF.
4004 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4006
4007 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4008 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4009 auto *SubVecTy =
4010 VectorType::get(VecVTy->getElementType(),
4011 VecVTy->getElementCount().divideCoefficientBy(Factor));
4012
4013 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4014 // Accesses having vector types that are a multiple of 128 bits can be
4015 // matched to more than one ldN/stN instruction.
4016 bool UseScalable;
4017 if (MinElts % Factor == 0 &&
4018 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4019 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4020 }
4021
4022 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4023 Alignment, AddressSpace, CostKind,
4024 UseMaskForCond, UseMaskForGaps);
4025}
4026
4031 for (auto *I : Tys) {
4032 if (!I->isVectorTy())
4033 continue;
4034 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4035 128)
4036 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4037 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4038 }
4039 return Cost;
4040}
4041
4043 return ST->getMaxInterleaveFactor();
4044}
4045
4046// For Falkor, we want to avoid having too many strided loads in a loop since
4047// that can exhaust the HW prefetcher resources. We adjust the unroller
4048// MaxCount preference below to attempt to ensure unrolling doesn't create too
4049// many strided loads.
4050static void
4053 enum { MaxStridedLoads = 7 };
4054 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4055 int StridedLoads = 0;
4056 // FIXME? We could make this more precise by looking at the CFG and
4057 // e.g. not counting loads in each side of an if-then-else diamond.
4058 for (const auto BB : L->blocks()) {
4059 for (auto &I : *BB) {
4060 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4061 if (!LMemI)
4062 continue;
4063
4064 Value *PtrValue = LMemI->getPointerOperand();
4065 if (L->isLoopInvariant(PtrValue))
4066 continue;
4067
4068 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4069 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4070 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4071 continue;
4072
4073 // FIXME? We could take pairing of unrolled load copies into account
4074 // by looking at the AddRec, but we would probably have to limit this
4075 // to loops with no stores or other memory optimization barriers.
4076 ++StridedLoads;
4077 // We've seen enough strided loads that seeing more won't make a
4078 // difference.
4079 if (StridedLoads > MaxStridedLoads / 2)
4080 return StridedLoads;
4081 }
4082 }
4083 return StridedLoads;
4084 };
4085
4086 int StridedLoads = countStridedLoads(L, SE);
4087 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4088 << " strided loads\n");
4089 // Pick the largest power of 2 unroll count that won't result in too many
4090 // strided loads.
4091 if (StridedLoads) {
4092 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4093 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4094 << UP.MaxCount << '\n');
4095 }
4096}
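// Example for the Falkor heuristic above: a loop containing 3 strided loads
// gets UP.MaxCount = 1 << Log2_32(7 / 3) = 2.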
4097
4098/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4099/// OOO engine's wide instruction window and various predictors.
4100static void
4104 // Limit runtime unrolling to loops whose structure is highly likely to
4105 // benefit from it; that is, we exclude outer loops, loops with multiple
4106 // exits and loops with many blocks (i.e. likely with complex control
4107 // flow). Note that the heuristics here may be overly conservative and we
4108 // err on the side of avoiding runtime unrolling rather than unrolling
4109 // excessively. They are all subject to further refinement.
4110 if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4111 return;
4112
4113 const SCEV *BTC = SE.getBackedgeTakenCount(L);
4114 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4115 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4116 SE.getSmallConstantMaxTripCount(L) <= 32))
4117 return;
4118 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4119 return;
4120
4121 int64_t Size = 0;
4122 for (auto *BB : L->getBlocks()) {
4123 for (auto &I : *BB) {
4124 if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4125 return;
4126 SmallVector<const Value *, 4> Operands(I.operand_values());
4127 Size +=
4129 }
4130 }
4131
4132 // Limit to loops with trip counts that are cheap to expand.
4133 UP.SCEVExpansionBudget = 1;
4134
4135 // Try to unroll small, single block loops, if they have load/store
4136 // dependencies, to expose more parallel memory access streams.
4137 BasicBlock *Header = L->getHeader();
4138 if (Header == L->getLoopLatch()) {
4139 if (Size > 8)
4140 return;
4141
4142 SmallPtrSet<Value *, 8> LoadedValues;
4144 for (auto *BB : L->blocks()) {
4145 for (auto &I : *BB) {
4147 if (!Ptr)
4148 continue;
4149 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4150 if (SE.isLoopInvariant(PtrSCEV, L))
4151 continue;
4152 if (isa<LoadInst>(&I))
4153 LoadedValues.insert(&I);
4154 else
4155 Stores.push_back(cast<StoreInst>(&I));
4156 }
4157 }
4158
4159 // Try to find an unroll count that maximizes the use of the instruction
4160 // window, i.e. trying to fetch as many instructions per cycle as possible.
4161 unsigned MaxInstsPerLine = 16;
4162 unsigned UC = 1;
4163 unsigned BestUC = 1;
4164 unsigned SizeWithBestUC = BestUC * Size;
4165 while (UC <= 8) {
4166 unsigned SizeWithUC = UC * Size;
4167 if (SizeWithUC > 48)
4168 break;
4169 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4170 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4171 BestUC = UC;
4172 SizeWithBestUC = BestUC * Size;
4173 }
4174 UC++;
4175 }
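// Worked example: for a loop body of Size = 6, the search above keeps
// growing UC while UC * Size <= 48 and prefers counts whose total size is a
// multiple of MaxInstsPerLine, settling on BestUC = 8 (48 instructions).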
4176
4177 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4178 return LoadedValues.contains(SI->getOperand(0));
4179 }))
4180 return;
4181
4182 UP.Runtime = true;
4183 UP.DefaultUnrollRuntimeCount = BestUC;
4184 return;
4185 }
4186
4187 // Try to runtime-unroll loops with early-continues depending on loop-varying
4188 // loads; this helps with branch-prediction for the early-continues.
4189 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4190 auto *Latch = L->getLoopLatch();
4192 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4193 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4194 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
4195 return;
4196
4197 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4198 [&](Instruction *I, unsigned Depth) -> bool {
4199 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4200 return false;
4201
4202 if (isa<LoadInst>(I))
4203 return true;
4204
4205 return any_of(I->operands(), [&](Value *V) {
4206 auto *I = dyn_cast<Instruction>(V);
4207 return I && DependsOnLoopLoad(I, Depth + 1);
4208 });
4209 };
4210 CmpPredicate Pred;
4211 Instruction *I;
4212 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4213 m_Value())) &&
4214 DependsOnLoopLoad(I, 0)) {
4215 UP.Runtime = true;
4216 }
4217}
4218
4222 // Enable partial unrolling and runtime unrolling.
4223 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4224
4225 UP.UpperBound = true;
4226
4227 // An inner loop is more likely to be hot, and the runtime check can be
4228 // hoisted out by the LICM pass, so the overhead is lower; let's try a
4229 // larger threshold to unroll more loops.
4230 if (L->getLoopDepth() > 1)
4231 UP.PartialThreshold *= 2;
4232
4233 // Disable partial & runtime unrolling on -Os.
4234 UP.PartialOptSizeThreshold = 0;
4235
4236 // Apply subtarget-specific unrolling preferences.
4237 switch (ST->getProcFamily()) {
4238 case AArch64Subtarget::AppleA14:
4239 case AArch64Subtarget::AppleA15:
4240 case AArch64Subtarget::AppleA16:
4241 case AArch64Subtarget::AppleM4:
4242 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4243 break;
4244 case AArch64Subtarget::Falkor:
4247 break;
4248 default:
4249 break;
4250 }
4251
4252 // Scan the loop: don't unroll loops with calls as this could prevent
4253 // inlining. Don't unroll vector loops either, as they don't benefit much from
4254 // unrolling.
4255 for (auto *BB : L->getBlocks()) {
4256 for (auto &I : *BB) {
4257 // Don't unroll vectorised loops.
4258 if (I.getType()->isVectorTy())
4259 return;
4260
4261 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4262 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4263 if (!isLoweredToCall(F))
4264 continue;
4265 }
4266 return;
4267 }
4268 }
4269 }
4270
4271 // Enable runtime unrolling for in-order models.
4272 // If -mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
4273 // checking for that case, we can ensure that the default behaviour is
4274 // unchanged.
4276 !ST->getSchedModel().isOutOfOrder()) {
4277 UP.Runtime = true;
4278 UP.Partial = true;
4279 UP.UnrollRemainder = true;
4281
4282 UP.UnrollAndJam = true;
4284 }
4285}
4286
4290}
4291
4293 Type *ExpectedType) {
4294 switch (Inst->getIntrinsicID()) {
4295 default:
4296 return nullptr;
4297 case Intrinsic::aarch64_neon_st2:
4298 case Intrinsic::aarch64_neon_st3:
4299 case Intrinsic::aarch64_neon_st4: {
4300 // Create a struct type
4301 StructType *ST = dyn_cast<StructType>(ExpectedType);
4302 if (!ST)
4303 return nullptr;
4304 unsigned NumElts = Inst->arg_size() - 1;
4305 if (ST->getNumElements() != NumElts)
4306 return nullptr;
4307 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4308 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4309 return nullptr;
4310 }
4311 Value *Res = PoisonValue::get(ExpectedType);
4312 IRBuilder<> Builder(Inst);
4313 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4314 Value *L = Inst->getArgOperand(i);
4315 Res = Builder.CreateInsertValue(Res, L, i);
4316 }
4317 return Res;
4318 }
4319 case Intrinsic::aarch64_neon_ld2:
4320 case Intrinsic::aarch64_neon_ld3:
4321 case Intrinsic::aarch64_neon_ld4:
4322 if (Inst->getType() == ExpectedType)
4323 return Inst;
4324 return nullptr;
4325 }
4326}
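// Illustrative IR (assumed shape, for exposition only) for the st2/st3/st4
// case above: given
//   call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %a, <4 x i32> %b, ptr %p)
// and an ExpectedType of { <4 x i32>, <4 x i32> }, the code rebuilds the
// aggregate from %a and %b with two insertvalue instructions.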
4327
4329 MemIntrinsicInfo &Info) {
4330 switch (Inst->getIntrinsicID()) {
4331 default:
4332 break;
4333 case Intrinsic::aarch64_neon_ld2:
4334 case Intrinsic::aarch64_neon_ld3:
4335 case Intrinsic::aarch64_neon_ld4:
4336 Info.ReadMem = true;
4337 Info.WriteMem = false;
4338 Info.PtrVal = Inst->getArgOperand(0);
4339 break;
4340 case Intrinsic::aarch64_neon_st2:
4341 case Intrinsic::aarch64_neon_st3:
4342 case Intrinsic::aarch64_neon_st4:
4343 Info.ReadMem = false;
4344 Info.WriteMem = true;
4345 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
4346 break;
4347 }
4348
4349 switch (Inst->getIntrinsicID()) {
4350 default:
4351 return false;
4352 case Intrinsic::aarch64_neon_ld2:
4353 case Intrinsic::aarch64_neon_st2:
4354 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4355 break;
4356 case Intrinsic::aarch64_neon_ld3:
4357 case Intrinsic::aarch64_neon_st3:
4358 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4359 break;
4360 case Intrinsic::aarch64_neon_ld4:
4361 case Intrinsic::aarch64_neon_st4:
4362 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4363 break;
4364 }
4365 return true;
4366}
4367
4368/// See if \p I should be considered for address type promotion. We check if
4369/// \p I is a sext with the right type and used in memory accesses. If it is
4370/// used in a "complex" getelementptr, we allow it to be promoted without
4371/// finding other sext instructions that sign extended the same initial value.
4372/// A getelementptr is considered "complex" if it has more than 2 operands.
4374 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4375 bool Considerable = false;
4376 AllowPromotionWithoutCommonHeader = false;
4377 if (!isa<SExtInst>(&I))
4378 return false;
4379 Type *ConsideredSExtType =
4380 Type::getInt64Ty(I.getParent()->getParent()->getContext());
4381 if (I.getType() != ConsideredSExtType)
4382 return false;
4383 // See if the sext is the one with the right type and used in at least one
4384 // GetElementPtrInst.
4385 for (const User *U : I.users()) {
4386 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4387 Considerable = true;
4388 // A getelementptr is considered as "complex" if it has more than 2
4389 // operands. We will promote a SExt used in such complex GEP as we
4390 // expect some computation to be merged if they are done on 64 bits.
4391 if (GEPInst->getNumOperands() > 2) {
4392 AllowPromotionWithoutCommonHeader = true;
4393 break;
4394 }
4395 }
4396 }
4397 return Considerable;
4398}
4399
4401 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4402 if (!VF.isScalable())
4403 return true;
4404
4405 Type *Ty = RdxDesc.getRecurrenceType();
4407 return false;
4408
4409 switch (RdxDesc.getRecurrenceKind()) {
4410 case RecurKind::Add:
4411 case RecurKind::FAdd:
4412 case RecurKind::And:
4413 case RecurKind::Or:
4414 case RecurKind::Xor:
4415 case RecurKind::SMin:
4416 case RecurKind::SMax:
4417 case RecurKind::UMin:
4418 case RecurKind::UMax:
4419 case RecurKind::FMin:
4420 case RecurKind::FMax:
4421 case RecurKind::FMulAdd:
4422 case RecurKind::IAnyOf:
4423 case RecurKind::FAnyOf:
4424 return true;
4425 default:
4426 return false;
4427 }
4428}
4429
4432 FastMathFlags FMF,
4434 // The code-generator is currently not able to handle scalable vectors
4435 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4436 // it. This change will be removed when code-generation for these types is
4437 // sufficiently reliable.
4438 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4439 if (VTy->getElementCount() == ElementCount::getScalable(1))
4441
4442 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4443
4444 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
4445 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
4446
4447 InstructionCost LegalizationCost = 0;
4448 if (LT.first > 1) {
4449 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
4450 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
4451 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
4452 }
4453
4454 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
4455}
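// Note on the min/max reduction cost above: if the legalized type needs two
// registers (LT.first == 2), one extra min/max intrinsic at the legal width
// is added on top of the flat horizontal-reduction cost of 2.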
4456
4458 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
4459 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4460 InstructionCost LegalizationCost = 0;
4461 if (LT.first > 1) {
4462 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
4463 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
4464 LegalizationCost *= LT.first - 1;
4465 }
4466
4467 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4468 assert(ISD && "Invalid opcode");
4469 // Add the final reduction cost for the legal horizontal reduction
4470 switch (ISD) {
4471 case ISD::ADD:
4472 case ISD::AND:
4473 case ISD::OR:
4474 case ISD::XOR:
4475 case ISD::FADD:
4476 return LegalizationCost + 2;
4477 default:
4479 }
4480}
4481
4484 std::optional<FastMathFlags> FMF,
4486 // The code-generator is currently not able to handle scalable vectors
4487 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4488 // it. This change will be removed when code-generation for these types is
4489 // sufficiently reliable.
4490 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
4491 if (VTy->getElementCount() == ElementCount::getScalable(1))
4493
4495 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
4496 InstructionCost BaseCost =
4497 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4498 // Add on extra cost to reflect the extra overhead on some CPUs. We still
4499 // end up vectorizing for more computationally intensive loops.
4500 return BaseCost + FixedVTy->getNumElements();
4501 }
4502
4503 if (Opcode != Instruction::FAdd)
4505
4506 auto *VTy = cast<ScalableVectorType>(ValTy);
4508 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
4509 Cost *= getMaxNumElements(VTy->getElementCount());
4510 return Cost;
4511 }
4512
4513 if (isa<ScalableVectorType>(ValTy))
4514 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
4515
4516 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4517 MVT MTy = LT.second;
4518 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4519 assert(ISD && "Invalid opcode");
4520
4521 // Horizontal adds can use the 'addv' instruction. We model the cost of these
4522 // instructions as twice a normal vector add, plus 1 for each legalization
4523 // step (LT.first). This is the only arithmetic vector reduction operation for
4524 // which we have an instruction.
4525 // OR, XOR and AND costs should match the codegen from:
4526 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
4527 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4528 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4529 static const CostTblEntry CostTblNoPairwise[]{
4530 {ISD::ADD, MVT::v8i8, 2},
4531 {ISD::ADD, MVT::v16i8, 2},
4532 {ISD::ADD, MVT::v4i16, 2},
4533 {ISD::ADD, MVT::v8i16, 2},
4534 {ISD::ADD, MVT::v4i32, 2},
4535 {ISD::ADD, MVT::v2i64, 2},
4536 {ISD::OR, MVT::v8i8, 15},
4537 {ISD::OR, MVT::v16i8, 17},
4538 {ISD::OR, MVT::v4i16, 7},
4539 {ISD::OR, MVT::v8i16, 9},
4540 {ISD::OR, MVT::v2i32, 3},
4541 {ISD::OR, MVT::v4i32, 5},
4542 {ISD::OR, MVT::v2i64, 3},
4543 {ISD::XOR, MVT::v8i8, 15},
4544 {ISD::XOR, MVT::v16i8, 17},
4545 {ISD::XOR, MVT::v4i16, 7},
4546 {ISD::XOR, MVT::v8i16, 9},
4547 {ISD::XOR, MVT::v2i32, 3},
4548 {ISD::XOR, MVT::v4i32, 5},
4549 {ISD::XOR, MVT::v2i64, 3},
4550 {ISD::AND, MVT::v8i8, 15},
4551 {ISD::AND, MVT::v16i8, 17},
4552 {ISD::AND, MVT::v4i16, 7},
4553 {ISD::AND, MVT::v8i16, 9},
4554 {ISD::AND, MVT::v2i32, 3},
4555 {ISD::AND, MVT::v4i32, 5},
4556 {ISD::AND, MVT::v2i64, 3},
4557 };
4558 switch (ISD) {
4559 default:
4560 break;
4561 case ISD::FADD:
4562 if (Type *EltTy = ValTy->getScalarType();
4563 // FIXME: For half types without fullfp16 support, this could extend and
4564 // use a fp32 faddp reduction but current codegen unrolls.
4565 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
4566 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
4567 const unsigned NElts = MTy.getVectorNumElements();
4568 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
4569 isPowerOf2_32(NElts))
4570 // Reduction corresponding to series of fadd instructions is lowered to
4571 // series of faddp instructions. faddp has latency/throughput that
4572 // matches fadd instruction and hence, every faddp instruction can be
4573 // considered to have a relative cost = 1 with
4574 // CostKind = TCK_RecipThroughput.
4575 // An faddp will pairwise add vector elements, so the size of input
4576 // vector reduces by half every time, requiring
4577 // #(faddp instructions) = log2_32(NElts).
4578 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
4579 }
4580 break;
4581 case ISD::ADD:
4582 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4583 return (LT.first - 1) + Entry->Cost;
4584 break;
4585 case ISD::XOR:
4586 case ISD::AND:
4587 case ISD::OR:
4588 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4589 if (!Entry)
4590 break;
4591 auto *ValVTy = cast<FixedVectorType>(ValTy);
4592 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4593 isPowerOf2_32(ValVTy->getNumElements())) {
4594 InstructionCost ExtraCost = 0;
4595 if (LT.first != 1) {
4596 // Type needs to be split, so there is an extra cost of LT.first - 1
4597 // arithmetic ops.
4598 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4599 MTy.getVectorNumElements());
4600 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4601 ExtraCost *= LT.first - 1;
4602 }
4603 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4604 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4605 return Cost + ExtraCost;
4606 }
4607 break;
4608 }
4609 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4610}
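// Worked examples for the table and formulas above (illustrative):
//  * vector.reduce.add on v16i8: LT = (1, v16i8), table cost 2 -> total 2.
//  * vector.reduce.add on v8i32: LT = (2, v4i32), so (2 - 1) + 2 = 3.
//  * fast-math vector.reduce.fadd on v8f32: LT = (2, v4f32), NElts = 4, so
//    (2 - 1) + Log2_32(4) = 3 faddp-equivalent steps.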
4611
4612InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
4613 static const CostTblEntry ShuffleTbl[] = {
4614 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4615 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4616 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4617 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4618 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4619 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4620 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4621 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4622 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4623 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4624 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4625 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4626 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4627 };
4628
4629 // The code-generator is currently not able to handle scalable vectors
4630 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4631 // it. This change will be removed when code-generation for these types is
4632 // sufficiently reliable.
4633 if (Tp->getElementCount() == ElementCount::getScalable(1))
4634 return InstructionCost::getInvalid();
4635
4636 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4637 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4638 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4639 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4640 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4641 : LT.second;
4642 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4643 InstructionCost LegalizationCost = 0;
4644 if (Index < 0) {
4645 LegalizationCost =
4646 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4647 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4648 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4649 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4650 }
4651
4652 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
4653 // Cost performed on a promoted type.
4654 if (LT.second.getScalarType() == MVT::i1) {
4655 LegalizationCost +=
4656 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4657 TTI::CastContextHint::None, CostKind) +
4658 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4659 TTI::CastContextHint::None, CostKind);
4660 }
4661 const auto *Entry =
4662 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4663 assert(Entry && "Illegal Type for Splice");
4664 LegalizationCost += Entry->Cost;
4665 return LegalizationCost * LT.first;
4666}
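// Worked example (illustrative): splicing two <vscale x 4 x i32> vectors hits
// the nxv4i32 table entry directly, giving a cost of 1. A predicate splice
// such as <vscale x 16 x i1> is costed on the promoted nxv16i8 type, so the
// zext/trunc casts above (and, for a negative Index, the compare/select) are
// added on top of the table cost.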
4667
4668InstructionCost AArch64TTIImpl::getPartialReductionCost(
4669 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
4670 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
4671 TTI::PartialReductionExtendKind OpBExtend,
4672 std::optional<unsigned> BinOp) const {
4673 InstructionCost Invalid = InstructionCost::getInvalid();
4674 InstructionCost Cost(TTI::TCC_Basic);
4675
4676 if (Opcode != Instruction::Add)
4677 return Invalid;
4678
4679 if (InputTypeA != InputTypeB)
4680 return Invalid;
4681
4682 EVT InputEVT = EVT::getEVT(InputTypeA);
4683 EVT AccumEVT = EVT::getEVT(AccumType);
4684
4685 if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
4686 return Invalid;
4687 if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
4688 return Invalid;
4689
4690 if (InputEVT == MVT::i8) {
4691 switch (VF.getKnownMinValue()) {
4692 default:
4693 return Invalid;
4694 case 8:
4695 if (AccumEVT == MVT::i32)
4696 Cost *= 2;
4697 else if (AccumEVT != MVT::i64)
4698 return Invalid;
4699 break;
4700 case 16:
4701 if (AccumEVT == MVT::i64)
4702 Cost *= 2;
4703 else if (AccumEVT != MVT::i32)
4704 return Invalid;
4705 break;
4706 }
4707 } else if (InputEVT == MVT::i16) {
4708 // FIXME: Allow i32 accumulator but increase cost, as we would extend
4709 // it to i64.
4710 if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
4711 return Invalid;
4712 } else
4713 return Invalid;
4714
4715 // AArch64 supports lowering mixed extensions to a usdot but only if the
4716 // i8mm or sve/streaming features are available.
4717 if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None ||
4718 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
4719 !ST->isSVEorStreamingSVEAvailable()))
4720 return Invalid;
4721
4722 if (!BinOp || *BinOp != Instruction::Mul)
4723 return Invalid;
4724
4725 return Cost;
4726}
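// Worked example (illustrative, assuming the TCC_Basic starting cost shown
// above): a partial add reduction that multiplies two sign-extended i8 vectors
// with VF 16 and accumulates into i32 on a dot-product capable target keeps
// the base cost, the VF 8 -> i32 and VF 16 -> i64 forms are charged double,
// and every other combination is reported as Invalid.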
4727
4728InstructionCost AArch64TTIImpl::getShuffleCost(
4729 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
4730 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
4731 ArrayRef<const Value *> Args, const Instruction *CxtI) {
4732 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4733
4734 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4735 // into smaller vectors and sum the cost of each shuffle.
4736 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4737 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4738 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4739
4740 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4741 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4742 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4743 // cost than just the load.
4744 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4745 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
4746 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
4747 return std::max<InstructionCost>(1, LT.first / 4);
4748
4749 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4750 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4751 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4752 // cost than just the store.
4753 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4754 (ShuffleVectorInst::isInterleaveMask(
4755 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4756 ShuffleVectorInst::isInterleaveMask(
4757 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4758 return LT.first;
4759
4760 unsigned TpNumElts = Mask.size();
4761 unsigned LTNumElts = LT.second.getVectorNumElements();
4762 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4763 VectorType *NTp =
4764 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4765 InstructionCost Cost;
4766 for (unsigned N = 0; N < NumVecs; N++) {
4767 SmallVector<int> NMask;
4768 // Split the existing mask into chunks of size LTNumElts. Track the source
4769 // sub-vectors to ensure the result has at most 2 inputs.
4770 unsigned Source1, Source2;
4771 unsigned NumSources = 0;
4772 for (unsigned E = 0; E < LTNumElts; E++) {
4773 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4774 : PoisonMaskElem;
4775 if (MaskElt < 0) {
4776 NMask.push_back(PoisonMaskElem);
4777 continue;
4778 }
4779
4780 // Calculate which source from the input this comes from and whether it
4781 // is new to us.
4782 unsigned Source = MaskElt / LTNumElts;
4783 if (NumSources == 0) {
4784 Source1 = Source;
4785 NumSources = 1;
4786 } else if (NumSources == 1 && Source != Source1) {
4787 Source2 = Source;
4788 NumSources = 2;
4789 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4790 NumSources++;
4791 }
4792
4793 // Add to the new mask. For the NumSources>2 case these are not correct,
4794 // but are only used for the modular lane number.
4795 if (Source == Source1)
4796 NMask.push_back(MaskElt % LTNumElts);
4797 else if (Source == Source2)
4798 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4799 else
4800 NMask.push_back(MaskElt % LTNumElts);
4801 }
4802 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4803 // getShuffleCost. If not then cost it using the worst case as the number
4804 // of element moves into a new vector.
4805 if (NumSources <= 2)
4806 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4807 : TTI::SK_PermuteTwoSrc,
4808 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4809 else
4810 Cost += LTNumElts;
4811 }
4812 return Cost;
4813 }
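// Worked example (illustrative): a 16-element mask on v16i32 (legal type
// v4i32, LTNumElts == 4) is split into four 4-element sub-masks; a sub-mask
// drawing from at most two of the split sources is re-costed through
// getShuffleCost, while a wider one is charged the worst case of 4 element
// moves.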
4814
4815 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4816 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4817 // A subvector extract can be implemented with an ext (or trivial extract, if
4818 // from lane 0). This currently only handles low or high extracts to prevent
4819 // SLP vectorizer regressions.
4820 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
4821 if (LT.second.is128BitVector() &&
4822 cast<FixedVectorType>(SubTp)->getNumElements() ==
4823 LT.second.getVectorNumElements() / 2) {
4824 if (Index == 0)
4825 return 0;
4826 if (Index == (int)LT.second.getVectorNumElements() / 2)
4827 return 1;
4828 }
4829 Kind = TTI::SK_PermuteSingleSrc;
4830 }
4831
4832 // Check for broadcast loads, which are supported by the LD1R instruction.
4833 // In terms of code-size, the shuffle vector is free when a load + dup get
4834 // folded into a LD1R. That's what we check and return here. For performance
4835 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4836 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4837 // that we model the load + dup sequence slightly higher because LD1R is a
4838 // high latency instruction.
4839 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4840 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4841 if (IsLoad && LT.second.isVector() &&
4842 isLegalBroadcastLoad(Tp->getElementType(),
4843 LT.second.getVectorElementCount()))
4844 return 0;
4845 }
4846
4847 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4848 // from the perfect shuffle tables.
4849 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4850 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4851 all_of(Mask, [](int E) { return E < 8; }))
4852 return getPerfectShuffleCost(Mask);
4853
4854 // Check for identity masks, which we can treat as free.
4855 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4856 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4857 all_of(enumerate(Mask), [](const auto &M) {
4858 return M.value() < 0 || M.value() == (int)M.index();
4859 }))
4860 return 0;
4861
4862 // Check for other shuffles that are not SK_ kinds but we have native
4863 // instructions for, for example ZIP and UZP.
4864 unsigned Unused;
4865 if (LT.second.isFixedLengthVector() &&
4866 LT.second.getVectorNumElements() == Mask.size() &&
4867 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4868 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4869 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4870 // Check for non-zero lane splats
4871 all_of(drop_begin(Mask),
4872 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4873 return 1;
4874
4875 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4876 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4877 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4878 static const CostTblEntry ShuffleTbl[] = {
4879 // Broadcast shuffle kinds can be performed with 'dup'.
4880 {TTI::SK_Broadcast, MVT::v8i8, 1},
4881 {TTI::SK_Broadcast, MVT::v16i8, 1},
4882 {TTI::SK_Broadcast, MVT::v4i16, 1},
4883 {TTI::SK_Broadcast, MVT::v8i16, 1},
4884 {TTI::SK_Broadcast, MVT::v2i32, 1},
4885 {TTI::SK_Broadcast, MVT::v4i32, 1},
4886 {TTI::SK_Broadcast, MVT::v2i64, 1},
4887 {TTI::SK_Broadcast, MVT::v4f16, 1},
4888 {TTI::SK_Broadcast, MVT::v8f16, 1},
4889 {TTI::SK_Broadcast, MVT::v2f32, 1},
4890 {TTI::SK_Broadcast, MVT::v4f32, 1},
4891 {TTI::SK_Broadcast, MVT::v2f64, 1},
4892 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4893 // 'zip1/zip2' instructions.
4894 {TTI::SK_Transpose, MVT::v8i8, 1},
4895 {TTI::SK_Transpose, MVT::v16i8, 1},
4896 {TTI::SK_Transpose, MVT::v4i16, 1},
4897 {TTI::SK_Transpose, MVT::v8i16, 1},
4898 {TTI::SK_Transpose, MVT::v2i32, 1},
4899 {TTI::SK_Transpose, MVT::v4i32, 1},
4900 {TTI::SK_Transpose, MVT::v2i64, 1},
4901 {TTI::SK_Transpose, MVT::v4f16, 1},
4902 {TTI::SK_Transpose, MVT::v8f16, 1},
4903 {TTI::SK_Transpose, MVT::v2f32, 1},
4904 {TTI::SK_Transpose, MVT::v4f32, 1},
4905 {TTI::SK_Transpose, MVT::v2f64, 1},
4906 // Select shuffle kinds.
4907 // TODO: handle vXi8/vXi16.
4908 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4909 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4910 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4911 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4912 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4913 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4914 // PermuteSingleSrc shuffle kinds.
4915 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4916 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4917 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4918 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4919 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4920 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4921 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4922 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4923 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4924 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4925 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4926 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4927 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4928 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4929 // Reverse can be lowered with `rev`.
4930 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4931 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4932 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4933 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4934 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4935 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4936 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4937 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4938 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4939 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4940 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4941 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4942 // Splice can all be lowered as `ext`.
4943 {TTI::SK_Splice, MVT::v2i32, 1},
4944 {TTI::SK_Splice, MVT::v4i32, 1},
4945 {TTI::SK_Splice, MVT::v2i64, 1},
4946 {TTI::SK_Splice, MVT::v2f32, 1},
4947 {TTI::SK_Splice, MVT::v4f32, 1},
4948 {TTI::SK_Splice, MVT::v2f64, 1},
4949 {TTI::SK_Splice, MVT::v8f16, 1},
4950 {TTI::SK_Splice, MVT::v8bf16, 1},
4951 {TTI::SK_Splice, MVT::v8i16, 1},
4952 {TTI::SK_Splice, MVT::v16i8, 1},
4953 {TTI::SK_Splice, MVT::v4bf16, 1},
4954 {TTI::SK_Splice, MVT::v4f16, 1},
4955 {TTI::SK_Splice, MVT::v4i16, 1},
4956 {TTI::SK_Splice, MVT::v8i8, 1},
4957 // Broadcast shuffle kinds for scalable vectors
4958 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4959 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4960 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4961 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4962 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4963 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4964 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4965 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4966 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4967 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4968 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4969 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4970 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4971 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4972 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4973 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4974 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4975 // Handle the cases for vector.reverse with scalable vectors
4976 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4977 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4978 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4979 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4980 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4981 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4982 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4983 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4984 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4985 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4986 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4987 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4988 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4989 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4990 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4991 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4992 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4993 };
4994 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4995 return LT.first * Entry->Cost;
4996 }
4997
4998 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4999 return getSpliceCost(Tp, Index);
5000
5001 // Inserting a subvector can often be done with either a D, S or H register
5002 // move, so long as the inserted vector is "aligned".
5003 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5004 LT.second.getSizeInBits() <= 128 && SubTp) {
5005 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
5006 if (SubLT.second.isVector()) {
5007 int NumElts = LT.second.getVectorNumElements();
5008 int NumSubElts = SubLT.second.getVectorNumElements();
5009 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5010 return SubLT.first;
5011 }
5012 }
5013
5014 // Restore optimal kind.
5015 if (IsExtractSubvector)
5016 Kind = TTI::SK_ExtractSubvector;
5017 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
5018 CxtI);
5019}
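// Worked examples for the tables above (illustrative): a broadcast of v4f32
// costs 1 (a single dup), a reverse of v16i8 costs 2 (rev64 + ext), and an
// identity permute mask is treated as free.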
5020
5021static bool containsDecreasingPointers(Loop *TheLoop,
5022 PredicatedScalarEvolution *PSE) {
5023 const auto &Strides = DenseMap<Value *, const SCEV *>();
5024 for (BasicBlock *BB : TheLoop->blocks()) {
5025 // Scan the instructions in the block and look for addresses that are
5026 // consecutive and decreasing.
5027 for (Instruction &I : *BB) {
5028 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
5029 Value *Ptr = getLoadStorePointerOperand(&I);
5030 Type *AccessTy = getLoadStoreType(&I);
5031 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
5032 /*ShouldCheckWrap=*/false)
5033 .value_or(0) < 0)
5034 return true;
5035 }
5036 }
5037 }
5038 return false;
5039}
5040
5041bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5042 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5043 return SVEPreferFixedOverScalableIfEqualCost;
5044 return ST->useFixedOverScalableIfEqualCost();
5045}
5046
5047unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5048 return ST->getEpilogueVectorizationMinVF();
5049}
5050
5051bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
5052 if (!ST->hasSVE())
5053 return false;
5054
5055 // We don't currently support vectorisation with interleaving for SVE - with
5056 // such loops we're better off not using tail-folding. This gives us a chance
5057 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
5058 if (TFI->IAI->hasGroups())
5059 return false;
5060
5061 TailFoldingOpts Required = TailFoldingOpts::Disabled;
5062 if (TFI->LVL->getReductionVars().size())
5063 Required |= TailFoldingOpts::Reductions;
5064 if (TFI->LVL->getFixedOrderRecurrences().size())
5065 Required |= TailFoldingOpts::Recurrences;
5066
5067 // We call this to discover whether any load/store pointers in the loop have
5068 // negative strides. This will require extra work to reverse the loop
5069 // predicate, which may be expensive.
5070 if (containsDecreasingPointers(TFI->LVL->getLoop(),
5071 TFI->LVL->getPredicatedScalarEvolution()))
5072 Required |= TailFoldingOpts::Reverse;
5073 if (Required == TailFoldingOpts::Disabled)
5074 Required |= TailFoldingOpts::Simple;
5075
5076 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
5077 Required))
5078 return false;
5079
5080 // Don't tail-fold for tight loops where we would be better off interleaving
5081 // with an unpredicated loop.
5082 unsigned NumInsns = 0;
5083 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5084 NumInsns += BB->sizeWithoutDebug();
5085 }
5086
5087 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
5088 return NumInsns >= SVETailFoldInsnThreshold;
5089}
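// Worked example (illustrative): a loop summing an array adds
// TailFoldingOpts::Reductions to Required, so tail-folding is only preferred
// when that bit is enabled (by the CPU default or -sve-tail-folding) and the
// loop body contains at least SVETailFoldInsnThreshold (15) instructions.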
5090
5091InstructionCost
5092AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
5093 StackOffset BaseOffset, bool HasBaseReg,
5094 int64_t Scale, unsigned AddrSpace) const {
5095 // Scaling factors are not free at all.
5096 // Operands | Rt Latency
5097 // -------------------------------------------
5098 // Rt, [Xn, Xm] | 4
5099 // -------------------------------------------
5100 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
5101 // Rt, [Xn, Wm, <extend> #imm] |
5102 TargetLoweringBase::AddrMode AM;
5103 AM.BaseGV = BaseGV;
5104 AM.BaseOffs = BaseOffset.getFixed();
5105 AM.HasBaseReg = HasBaseReg;
5106 AM.Scale = Scale;
5107 AM.ScalableOffset = BaseOffset.getScalable();
5108 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
5109 // Scale represents reg2 * scale, thus account for 1 if
5110 // it is not equal to 0 or 1.
5111 return AM.Scale != 0 && AM.Scale != 1;
5112 return -1;
5113}
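// Worked example (illustrative): an access like ldr w0, [x1, x2, lsl #2]
// gives AM.Scale == 4, so a legal scaled mode returns 1; the unscaled form
// [x1, x2] (Scale == 1) returns 0; an addressing mode the target cannot
// encode returns -1.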
5114
5115bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
5116 if (EnableOrLikeSelectOpt) {
5117 // For the binary operators (e.g. or) we need to be more careful than
5118 // selects, here we only transform them if they are already at a natural
5119 // break point in the code - the end of a block with an unconditional
5120 // terminator.
5121 if (I->getOpcode() == Instruction::Or &&
5122 isa<BranchInst>(I->getNextNode()) &&
5123 cast<BranchInst>(I->getNextNode())->isUnconditional())
5124 return true;
5125
5126 if (I->getOpcode() == Instruction::Add ||
5127 I->getOpcode() == Instruction::Sub)
5128 return true;
5129 }
5130 return BaseT::shouldTreatInstructionLikeSelect(I);
5131}
5132
5133bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5134 const TargetTransformInfo::LSRCost &C2) {
5135 // AArch64 specific here is adding the number of instructions to the
5136 // comparison (though not as the first consideration, as some targets do)
5137 // along with changing the priority of the base additions.
5138 // TODO: Maybe a more nuanced tradeoff between instruction count
5139 // and number of registers? To be investigated at a later date.
5140 if (EnableLSRCostOpt)
5141 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5142 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5143 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5144 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5145
5146 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
5147}
5148
5149static bool isSplatShuffle(Value *V) {
5150 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5151 return all_equal(Shuf->getShuffleMask());
5152 return false;
5153}
5154
5155/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5156/// or upper half of the vector elements.
5157static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5158 bool AllowSplat = false) {
5159 // Scalable types can't be extract shuffle vectors.
5160 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5161 return false;
5162
5163 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5164 auto *FullTy = FullV->getType();
5165 auto *HalfTy = HalfV->getType();
5166 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5167 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5168 };
5169
5170 auto extractHalf = [](Value *FullV, Value *HalfV) {
5171 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5172 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5173 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5174 };
5175
5176 ArrayRef<int> M1, M2;
5177 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5178 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5179 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5180 return false;
5181
5182 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5183 // it is not checked as an extract below.
5184 if (AllowSplat && isSplatShuffle(Op1))
5185 S1Op1 = nullptr;
5186 if (AllowSplat && isSplatShuffle(Op2))
5187 S2Op1 = nullptr;
5188
5189 // Check that the operands are half as wide as the result and we extract
5190 // half of the elements of the input vectors.
5191 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5192 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5193 return false;
5194
5195 // Check the mask extracts either the lower or upper half of vector
5196 // elements.
5197 int M1Start = 0;
5198 int M2Start = 0;
5199 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5200 if ((S1Op1 &&
5201 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5202 (S2Op1 &&
5203 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5204 return false;
5205
5206 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5207 (M2Start != 0 && M2Start != (NumElements / 2)))
5208 return false;
5209 if (S1Op1 && S2Op1 && M1Start != M2Start)
5210 return false;
5211
5212 return true;
5213}
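// Illustrative IR accepted by the check above (both operands extract the high
// half of their respective 128-bit inputs, which suits the *2 variants such as
// smull2/usubl2):
//   %h1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
//   %h2 = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>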
5214
5215/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5216/// of the vector elements.
5217static bool areExtractExts(Value *Ext1, Value *Ext2) {
5218 auto areExtDoubled = [](Instruction *Ext) {
5219 return Ext->getType()->getScalarSizeInBits() ==
5220 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5221 };
5222
5223 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5224 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5225 !areExtDoubled(cast<Instruction>(Ext1)) ||
5226 !areExtDoubled(cast<Instruction>(Ext2)))
5227 return false;
5228
5229 return true;
5230}
5231
5232/// Check if Op could be used with vmull_high_p64 intrinsic.
5233static bool isOperandOfVmullHighP64(Value *Op) {
5234 Value *VectorOperand = nullptr;
5235 ConstantInt *ElementIndex = nullptr;
5236 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5237 m_ConstantInt(ElementIndex))) &&
5238 ElementIndex->getValue() == 1 &&
5239 isa<FixedVectorType>(VectorOperand->getType()) &&
5240 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5241}
5242
5243/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5244static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5245 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
5246}
5247
5248static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
5249 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5250 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5251 if (!GEP || GEP->getNumOperands() != 2)
5252 return false;
5253
5254 Value *Base = GEP->getOperand(0);
5255 Value *Offsets = GEP->getOperand(1);
5256
5257 // We only care about scalar_base+vector_offsets.
5258 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5259 return false;
5260
5261 // Sink extends that would allow us to use 32-bit offset vectors.
5262 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5263 auto *OffsetsInst = cast<Instruction>(Offsets);
5264 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5265 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5266 Ops.push_back(&GEP->getOperandUse(1));
5267 }
5268
5269 // Sink the GEP.
5270 return true;
5271}
5272
5273/// We want to sink following cases:
5274/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5275/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
5276static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
5277 if (match(Op, m_VScale()))
5278 return true;
5279 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5280 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
5281 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5282 return true;
5283 }
5284 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5285 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
5286 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5287 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5288 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5289 return true;
5290 }
5291 return false;
5292}
5293
5294/// Check if sinking \p I's operands to I's basic block is profitable, because
5295/// the operands can be folded into a target instruction, e.g.
5296/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
5297bool AArch64TTIImpl::isProfitableToSinkOperands(
5298 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5299 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5300 switch (II->getIntrinsicID()) {
5301 case Intrinsic::aarch64_neon_smull:
5302 case Intrinsic::aarch64_neon_umull:
5303 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
5304 /*AllowSplat=*/true)) {
5305 Ops.push_back(&II->getOperandUse(0));
5306 Ops.push_back(&II->getOperandUse(1));
5307 return true;
5308 }
5309 [[fallthrough]];
5310
5311 case Intrinsic::fma:
5312 case Intrinsic::fmuladd:
5313 if (isa<VectorType>(I->getType()) &&
5314 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5315 !ST->hasFullFP16())
5316 return false;
5317 [[fallthrough]];
5318 case Intrinsic::aarch64_neon_sqdmull:
5319 case Intrinsic::aarch64_neon_sqdmulh:
5320 case Intrinsic::aarch64_neon_sqrdmulh:
5321 // Sink splats for index lane variants
5322 if (isSplatShuffle(II->getOperand(0)))
5323 Ops.push_back(&II->getOperandUse(0));
5324 if (isSplatShuffle(II->getOperand(1)))
5325 Ops.push_back(&II->getOperandUse(1));
5326 return !Ops.empty();
5327 case Intrinsic::aarch64_neon_fmlal:
5328 case Intrinsic::aarch64_neon_fmlal2:
5329 case Intrinsic::aarch64_neon_fmlsl:
5330 case Intrinsic::aarch64_neon_fmlsl2:
5331 // Sink splats for index lane variants
5332 if (isSplatShuffle(II->getOperand(1)))
5333 Ops.push_back(&II->getOperandUse(1));
5334 if (isSplatShuffle(II->getOperand(2)))
5335 Ops.push_back(&II->getOperandUse(2));
5336 return !Ops.empty();
5337 case Intrinsic::aarch64_sve_ptest_first:
5338 case Intrinsic::aarch64_sve_ptest_last:
5339 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
5340 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
5341 Ops.push_back(&II->getOperandUse(0));
5342 return !Ops.empty();
5343 case Intrinsic::aarch64_sme_write_horiz:
5344 case Intrinsic::aarch64_sme_write_vert:
5345 case Intrinsic::aarch64_sme_writeq_horiz:
5346 case Intrinsic::aarch64_sme_writeq_vert: {
5347 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
5348 if (!Idx || Idx->getOpcode() != Instruction::Add)
5349 return false;
5350 Ops.push_back(&II->getOperandUse(1));
5351 return true;
5352 }
5353 case Intrinsic::aarch64_sme_read_horiz:
5354 case Intrinsic::aarch64_sme_read_vert:
5355 case Intrinsic::aarch64_sme_readq_horiz:
5356 case Intrinsic::aarch64_sme_readq_vert:
5357 case Intrinsic::aarch64_sme_ld1b_vert:
5358 case Intrinsic::aarch64_sme_ld1h_vert:
5359 case Intrinsic::aarch64_sme_ld1w_vert:
5360 case Intrinsic::aarch64_sme_ld1d_vert:
5361 case Intrinsic::aarch64_sme_ld1q_vert:
5362 case Intrinsic::aarch64_sme_st1b_vert:
5363 case Intrinsic::aarch64_sme_st1h_vert:
5364 case Intrinsic::aarch64_sme_st1w_vert:
5365 case Intrinsic::aarch64_sme_st1d_vert:
5366 case Intrinsic::aarch64_sme_st1q_vert:
5367 case Intrinsic::aarch64_sme_ld1b_horiz:
5368 case Intrinsic::aarch64_sme_ld1h_horiz:
5369 case Intrinsic::aarch64_sme_ld1w_horiz:
5370 case Intrinsic::aarch64_sme_ld1d_horiz:
5371 case Intrinsic::aarch64_sme_ld1q_horiz:
5372 case Intrinsic::aarch64_sme_st1b_horiz:
5373 case Intrinsic::aarch64_sme_st1h_horiz:
5374 case Intrinsic::aarch64_sme_st1w_horiz:
5375 case Intrinsic::aarch64_sme_st1d_horiz:
5376 case Intrinsic::aarch64_sme_st1q_horiz: {
5377 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
5378 if (!Idx || Idx->getOpcode() != Instruction::Add)
5379 return false;
5380 Ops.push_back(&II->getOperandUse(3));
5381 return true;
5382 }
5383 case Intrinsic::aarch64_neon_pmull:
5384 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
5385 return false;
5386 Ops.push_back(&II->getOperandUse(0));
5387 Ops.push_back(&II->getOperandUse(1));
5388 return true;
5389 case Intrinsic::aarch64_neon_pmull64:
5390 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
5391 II->getArgOperand(1)))
5392 return false;
5393 Ops.push_back(&II->getArgOperandUse(0));
5394 Ops.push_back(&II->getArgOperandUse(1));
5395 return true;
5396 case Intrinsic::masked_gather:
5397 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
5398 return false;
5399 Ops.push_back(&II->getArgOperandUse(0));
5400 return true;
5401 case Intrinsic::masked_scatter:
5402 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
5403 return false;
5404 Ops.push_back(&II->getArgOperandUse(1));
5405 return true;
5406 default:
5407 return false;
5408 }
5409 }
5410
5411 auto ShouldSinkCondition = [](Value *Cond) -> bool {
5412 auto *II = dyn_cast<IntrinsicInst>(Cond);
5413 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
5414 isa<ScalableVectorType>(II->getOperand(0)->getType());
5415 };
5416
5417 switch (I->getOpcode()) {
5418 case Instruction::GetElementPtr:
5419 case Instruction::Add:
5420 case Instruction::Sub:
5421 // Sink vscales closer to uses for better isel
5422 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
5423 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
5424 Ops.push_back(&I->getOperandUse(Op));
5425 return true;
5426 }
5427 }
5428 break;
5429 case Instruction::Select: {
5430 if (!ShouldSinkCondition(I->getOperand(0)))
5431 return false;
5432
5433 Ops.push_back(&I->getOperandUse(0));
5434 return true;
5435 }
5436 case Instruction::Br: {
5437 if (cast<BranchInst>(I)->isUnconditional())
5438 return false;
5439
5440 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
5441 return false;
5442
5443 Ops.push_back(&I->getOperandUse(0));
5444 return true;
5445 }
5446 default:
5447 break;
5448 }
5449
5450 if (!I->getType()->isVectorTy())
5451 return false;
5452
5453 switch (I->getOpcode()) {
5454 case Instruction::Sub:
5455 case Instruction::Add: {
5456 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
5457 return false;
5458
5459 // If the exts' operands extract either the lower or upper elements, we
5460 // can sink them too.
5461 auto Ext1 = cast<Instruction>(I->getOperand(0));
5462 auto Ext2 = cast<Instruction>(I->getOperand(1));
5463 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
5464 Ops.push_back(&Ext1->getOperandUse(0));
5465 Ops.push_back(&Ext2->getOperandUse(0));
5466 }
5467
5468 Ops.push_back(&I->getOperandUse(0));
5469 Ops.push_back(&I->getOperandUse(1));
5470
5471 return true;
5472 }
5473 case Instruction::Or: {
5474 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
5475 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
5476 if (ST->hasNEON()) {
5477 Instruction *OtherAnd, *IA, *IB;
5478 Value *MaskValue;
5479 // MainAnd refers to And instruction that has 'Not' as one of its operands
5480 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
5481 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
5482 m_Instruction(IA)))))) {
5483 if (match(OtherAnd,
5484 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
5485 Instruction *MainAnd = I->getOperand(0) == OtherAnd
5486 ? cast<Instruction>(I->getOperand(1))
5487 : cast<Instruction>(I->getOperand(0));
5488
5489 // Both Ands should be in same basic block as Or
5490 if (I->getParent() != MainAnd->getParent() ||
5491 I->getParent() != OtherAnd->getParent())
5492 return false;
5493
5494 // Non-mask operands of both Ands should also be in same basic block
5495 if (I->getParent() != IA->getParent() ||
5496 I->getParent() != IB->getParent())
5497 return false;
5498
5499 Ops.push_back(
5500 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
5501 Ops.push_back(&I->getOperandUse(0));
5502 Ops.push_back(&I->getOperandUse(1));
5503
5504 return true;
5505 }
5506 }
5507 }
5508
5509 return false;
5510 }
5511 case Instruction::Mul: {
5512 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
5513 auto *Ty = cast<VectorType>(V->getType());
5514 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5515 if (Ty->isScalableTy())
5516 return false;
5517
5518 // Indexed variants of Mul exist for i16 and i32 element types only.
5519 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
5520 };
5521
5522 int NumZExts = 0, NumSExts = 0;
5523 for (auto &Op : I->operands()) {
5524 // Make sure we are not already sinking this operand
5525 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
5526 continue;
5527
5528 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
5529 auto *Ext = cast<Instruction>(Op);
5530 auto *ExtOp = Ext->getOperand(0);
5531 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
5532 Ops.push_back(&Ext->getOperandUse(0));
5533 Ops.push_back(&Op);
5534
5535 if (isa<SExtInst>(Ext))
5536 NumSExts++;
5537 else
5538 NumZExts++;
5539
5540 continue;
5541 }
5542
5543 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
5544 if (!Shuffle)
5545 continue;
5546
5547 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
5548 // operand and the s/zext can help create indexed s/umull. This is
5549 // especially useful to prevent i64 mul being scalarized.
5550 if (isSplatShuffle(Shuffle) &&
5551 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
5552 Ops.push_back(&Shuffle->getOperandUse(0));
5553 Ops.push_back(&Op);
5554 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
5555 NumSExts++;
5556 else
5557 NumZExts++;
5558 continue;
5559 }
5560
5561 Value *ShuffleOperand = Shuffle->getOperand(0);
5562 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
5563 if (!Insert)
5564 continue;
5565
5566 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
5567 if (!OperandInstr)
5568 continue;
5569
5570 ConstantInt *ElementConstant =
5571 dyn_cast<ConstantInt>(Insert->getOperand(2));
5572 // Check that the insertelement is inserting into element 0
5573 if (!ElementConstant || !ElementConstant->isZero())
5574 continue;
5575
5576 unsigned Opcode = OperandInstr->getOpcode();
5577 if (Opcode == Instruction::SExt)
5578 NumSExts++;
5579 else if (Opcode == Instruction::ZExt)
5580 NumZExts++;
5581 else {
5582 // If we find that the top bits are known 0, then we can sink and allow
5583 // the backend to generate a umull.
5584 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
5585 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
5586 const DataLayout &DL = I->getDataLayout();
5587 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
5588 continue;
5589 NumZExts++;
5590 }
5591
5592 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
5593 // the And, just to hoist it again back to the load.
5594 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
5595 Ops.push_back(&Insert->getOperandUse(1));
5596 Ops.push_back(&Shuffle->getOperandUse(0));
5597 Ops.push_back(&Op);
5598 }
5599
5600 // It is profitable to sink if we found two of the same type of extends.
5601 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
5602 return true;
5603
5604 // Otherwise, see if we should sink splats for indexed variants.
5605 if (!ShouldSinkSplatForIndexedVariant(I))
5606 return false;
5607
5608 Ops.clear();
5609 if (isSplatShuffle(I->getOperand(0)))
5610 Ops.push_back(&I->getOperandUse(0));
5611 if (isSplatShuffle(I->getOperand(1)))
5612 Ops.push_back(&I->getOperandUse(1));
5613
5614 return !Ops.empty();
5615 }
5616 case Instruction::FMul: {
5617 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5618 if (I->getType()->isScalableTy())
5619 return false;
5620
5621 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5622 !ST->hasFullFP16())
5623 return false;
5624
5625 // Sink splats for index lane variants
5626 if (isSplatShuffle(I->getOperand(0)))
5627 Ops.push_back(&I->getOperandUse(0));
5628 if (isSplatShuffle(I->getOperand(1)))
5629 Ops.push_back(&I->getOperandUse(1));
5630 return !Ops.empty();
5631 }
5632 default:
5633 return false;
5634 }
5635 return false;
5636}
void negate()
Negate this APInt in place.
Definition: APInt.h:1450
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
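A minimal sketch (not part of the generated reference) of the APInt queries listed above; the helper name isCheapImmediate and its thresholds are hypothetical.
static bool isCheapImmediate(const llvm::APInt &Imm) {
  // Powers of two (or negated powers of two) typically lower to a shift or a
  // shift-plus-negate sequence.
  if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
    return true;
  // Otherwise, fewer set bits usually means fewer materialization instructions.
  return Imm.popcount() <= 2;
}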
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:622
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:306
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:218
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isIntPredicate() const
Definition: InstrTypes.h:781
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1672
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:208
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition: IRBuilder.h:92
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:89
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:578
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1163
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:546
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:563
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2234
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1677
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:566
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:573
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
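A hedged sketch of the IRBuilder calls listed above, assuming the usual LLVM headers, using namespace llvm, and a builder already positioned in a block; the function name and the fixed <4 x i32> type are illustrative.
static Value *emitMaskedSplatLoad(IRBuilder<> &B, Value *Ptr) {
  auto *VecTy = FixedVectorType::get(B.getInt32Ty(), 4);
  // All-active mask: splat the i1 constant true across 4 lanes.
  Value *Mask = B.CreateVectorSplat(4, B.getInt1(true));
  // Masked load of <4 x i32>, zero-filling the inactive lanes.
  Value *PassThru = Constant::getNullValue(VecTy);
  return B.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask, PassThru);
}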
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:48
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
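Illustrative only: a typical shape for an intrinsic fold built on the combiner-aware helpers above. The fold itself (forward operand 0 when operand 1 is zero) is invented for the example.
static std::optional<Instruction *> foldWhenIndexIsZero(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  using namespace PatternMatch;
  // Hypothetical fold: a zero second operand makes the call a no-op, so
  // forward the first operand through the combiner-aware RAUW helper.
  if (match(II.getArgOperand(1), m_ZeroInt()))
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  return std::nullopt;
}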
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:291
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:694
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:77
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:651
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
This instruction constructs a fixed permutation of two input vectors.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
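A small sketch of the static mask classifiers above; the enum and the classifyMask helper are hypothetical, not LLVM API.
enum class MaskKind { DeInterleaveOf2, ExtractSub, Other };
static MaskKind classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  unsigned Lane;
  // e.g. <0,2,4,6> or <1,3,5,7> select one lane of an interleave-by-2 pair.
  if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, /*Factor=*/2, Lane))
    return MaskKind::DeInterleaveOf2;
  int Start;
  // A contiguous slice of the source vector starting at Start.
  if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Start))
    return MaskKind::ExtractSub;
  return MaskKind::Other;
}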
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
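A minimal usage sketch of the StringSwitch pattern above; the option strings and values are invented.
static unsigned parseUnrollLevel(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("none", 0)
      .Case("default", 1)
      .Case("aggressive", 2)
      .Default(1); // fall back when the string matches no case
}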
Class to represent struct types.
Definition: DerivedTypes.h:218
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
int getNumOccurrences() const
Definition: CommandLine.h:399
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
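A sketch of the fixed vs. scalable quantities described above; the concrete values are illustrative.
static void elementCountSketch() {
  ElementCount Fixed = ElementCount::getFixed(4);    // exactly 4 lanes
  ElementCount Scal = ElementCount::getScalable(4);  // 4 * vscale lanes
  (void)Fixed.isFixed();                             // true
  (void)Scal.getKnownMinValue();                     // 4; the real count is a runtime multiple
  TypeSize Bits = TypeSize::getScalable(128);        // 128 * vscale bits
  (void)Bits.isScalable();                           // true
}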
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
uint64_t getFMVPriority(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:931
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
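A short sketch of the matcher combinators listed above: recognising mul(zext(A), zext(B)), a shape relevant to widening-multiply costs. The helper name is hypothetical.
static bool isZExtWideningMul(Value *V, Value *&A, Value *&B) {
  using namespace PatternMatch;
  // Captures the two zero-extended operands when V is mul(zext(A), zext(B)).
  return match(V, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
}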
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1077
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition: MathExtras.h:298
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition: VE.h:376
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
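Illustrative use of the bit-math helpers above; the helper and its zero-return fallback convention are hypothetical.
static unsigned shiftAmountForScale(uint32_t Scale) {
  // A power-of-two scale folds into a left shift by log2(Scale).
  if (isPowerOf2_32(Scale))
    return Log2_32(Scale);
  return 0; // caller would fall back to an explicit multiply
}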
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
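A sketch of the cost-table lookup pattern; the table entries and costs below are invented for illustration, not AArch64's real numbers.
static const TypeConversionCostTblEntry ExampleConversionTbl[] = {
    {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
    {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
};
static InstructionCost lookupExtendCost(int ISD, MVT Dst, MVT Src) {
  if (const auto *Entry = ConvertCostTableLookup(ExampleConversionTbl, ISD, Dst, Src))
    return Entry->Cost;
  return InstructionCost::getInvalid(); // no table entry; illustrative fallback
}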
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:384
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
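A sketch of building and querying an EVT, assuming an available LLVMContext; the chosen type is illustrative.
static void evtSketch(LLVMContext &Ctx) {
  // A scalable <vscale x 4 x i32> value type.
  EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4, /*IsScalable=*/true);
  (void)VT.isScalableVector();     // true
  (void)VT.getVectorElementType(); // i32
  (void)VT.getSizeInBits();        // TypeSize: 128 * vscale bits
}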
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp; IsZeroCmp is true when the expansion is for a comparison against zero.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55