AArch64TargetTransformInfo.cpp (LLVM 20.0.0git)
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
29#include <algorithm>
30#include <optional>
31using namespace llvm;
32using namespace llvm::PatternMatch;
33
34#define DEBUG_TYPE "aarch64tti"
35
36static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37 cl::init(true), cl::Hidden);
38
39static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41
42static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
43 cl::Hidden);
44
45static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46 cl::init(10), cl::Hidden);
47
48static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49 cl::init(15), cl::Hidden);
50
51static cl::opt<unsigned>
52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
53 cl::Hidden);
54
55static cl::opt<unsigned> CallPenaltyChangeSM(
56 "call-penalty-sm-change", cl::init(5), cl::Hidden,
57 cl::desc(
58 "Penalty of calling a function that requires a change to PSTATE.SM"));
59
60static cl::opt<unsigned> InlineCallPenaltyChangeSM(
61 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63
64static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65 cl::init(true), cl::Hidden);
66
67static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68 cl::init(true), cl::Hidden);
69
70// A complete guess as to a reasonable cost.
71static cl::opt<unsigned>
72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
73 cl::desc("The cost of a histcnt instruction"));
74
75static cl::opt<unsigned> DMBLookaheadThreshold(
76 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
77 cl::desc("The number of instructions to search for a redundant dmb"));
78
79namespace {
80class TailFoldingOption {
81 // These bitfields will only ever be set to something non-zero in operator=,
82 // when setting the -sve-tail-folding option. This option should always be of
83 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
84 // InitialBits is one of (disabled|all|simple). EnableBits represents
85 // additional flags we're enabling, and DisableBits for those flags we're
86 // disabling. The default flag is tracked in the variable NeedsDefault, since
87 // at the time of setting the option we may not know what the default value
88 // for the CPU is.
89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92
93 // This value needs to be initialised to true in case the user does not
94 // explicitly set the -sve-tail-folding option.
95 bool NeedsDefault = true;
96
97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98
99 void setNeedsDefault(bool V) { NeedsDefault = V; }
100
101 void setEnableBit(TailFoldingOpts Bit) {
102 EnableBits |= Bit;
103 DisableBits &= ~Bit;
104 }
105
106 void setDisableBit(TailFoldingOpts Bit) {
107 EnableBits &= ~Bit;
108 DisableBits |= Bit;
109 }
110
111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113
114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115 "Initial bits should only include one of "
116 "(disabled|all|simple|default)");
117 Bits = NeedsDefault ? DefaultBits : InitialBits;
118 Bits |= EnableBits;
119 Bits &= ~DisableBits;
120
121 return Bits;
122 }
123
124 void reportError(std::string Opt) {
125 errs() << "invalid argument '" << Opt
126 << "' to -sve-tail-folding=; the option should be of the form\n"
127 " (disabled|all|default|simple)[+(reductions|recurrences"
128 "|reverse|noreductions|norecurrences|noreverse)]\n";
129 report_fatal_error("Unrecognised tail-folding option");
130 }
131
132public:
133
134 void operator=(const std::string &Val) {
135 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 if (Val.empty()) {
137 reportError("");
138 return;
139 }
140
141 // Since the user is explicitly setting the option we don't automatically
142 // need the default unless they require it.
143 setNeedsDefault(false);
144
145 SmallVector<StringRef, 4> TailFoldTypes;
146 StringRef(Val).split(TailFoldTypes, '+', -1, false);
147
148 unsigned StartIdx = 1;
149 if (TailFoldTypes[0] == "disabled")
150 setInitialBits(TailFoldingOpts::Disabled);
151 else if (TailFoldTypes[0] == "all")
152 setInitialBits(TailFoldingOpts::All);
153 else if (TailFoldTypes[0] == "default")
154 setNeedsDefault(true);
155 else if (TailFoldTypes[0] == "simple")
156 setInitialBits(TailFoldingOpts::Simple);
157 else {
158 StartIdx = 0;
159 setInitialBits(TailFoldingOpts::Disabled);
160 }
161
162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163 if (TailFoldTypes[I] == "reductions")
164 setEnableBit(TailFoldingOpts::Reductions);
165 else if (TailFoldTypes[I] == "recurrences")
166 setEnableBit(TailFoldingOpts::Recurrences);
167 else if (TailFoldTypes[I] == "reverse")
168 setEnableBit(TailFoldingOpts::Reverse);
169 else if (TailFoldTypes[I] == "noreductions")
170 setDisableBit(TailFoldingOpts::Reductions);
171 else if (TailFoldTypes[I] == "norecurrences")
172 setDisableBit(TailFoldingOpts::Recurrences);
173 else if (TailFoldTypes[I] == "noreverse")
174 setDisableBit(TailFoldingOpts::Reverse);
175 else
176 reportError(Val);
177 }
178 }
179
180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181 return (getBits(DefaultBits) & Required) == Required;
182 }
183};
184} // namespace
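// Worked example (illustrative, not part of the upstream file): with
// -sve-tail-folding=default+reductions+noreverse, operator= above keeps
// NeedsDefault true, adds TailFoldingOpts::Reductions to EnableBits and
// TailFoldingOpts::Reverse to DisableBits, so getBits(DefaultBits) yields
//   (DefaultBits | TailFoldingOpts::Reductions) & ~TailFoldingOpts::Reverse
// and satisfies(DefaultBits, TailFoldingOpts::Reductions) is true regardless
// of the CPU's default flags.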
185
186TailFoldingOption TailFoldingOptionLoc;
187
188cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
189 "sve-tail-folding",
190 cl::desc(
191 "Control the use of vectorisation using tail-folding for SVE where the"
192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193 "\ndisabled (Initial) No loop types will vectorize using "
194 "tail-folding"
195 "\ndefault (Initial) Uses the default tail-folding settings for "
196 "the target CPU"
197 "\nall (Initial) All legal loop types will vectorize using "
198 "tail-folding"
199 "\nsimple (Initial) Use tail-folding for simple loops (not "
200 "reductions or recurrences)"
201 "\nreductions Use tail-folding for loops containing reductions"
202 "\nnoreductions Inverse of above"
203 "\nrecurrences Use tail-folding for loops containing fixed order "
204 "recurrences"
205 "\nnorecurrences Inverse of above"
206 "\nreverse Use tail-folding for loops requiring reversed "
207 "predicates"
208 "\nnoreverse Inverse of above"),
209 cl::location(TailFoldingOptionLoc));
210
211// Experimental option that will only be fully functional when the
212// code-generator is changed to use SVE instead of NEON for all fixed-width
213// operations.
214static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
216
217// Experimental option that will only be fully functional when the cost-model
218// and code-generator have been changed to avoid using scalable vector
219// instructions that are not legal in streaming SVE mode.
220static cl::opt<bool> EnableScalableAutovecInStreamingMode(
221 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
222
223static bool isSMEABIRoutineCall(const CallInst &CI) {
224 const auto *F = CI.getCalledFunction();
225 return F && StringSwitch<bool>(F->getName())
226 .Case("__arm_sme_state", true)
227 .Case("__arm_tpidr2_save", true)
228 .Case("__arm_tpidr2_restore", true)
229 .Case("__arm_za_disable", true)
230 .Default(false);
231}
232
233/// Returns true if the function has explicit operations that can only be
234/// lowered using incompatible instructions for the selected mode. This also
235/// returns true if the function F may use or modify ZA state.
236static bool hasPossibleIncompatibleOps(const Function *F) {
237 for (const BasicBlock &BB : *F) {
238 for (const Instruction &I : BB) {
239 // Be conservative for now and assume that any call to inline asm or to
240 // intrinsics could result in non-streaming ops (e.g. calls to
241 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
242 // all native LLVM instructions can be lowered to compatible instructions.
243 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
244 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
245 isSMEABIRoutineCall(cast<CallInst>(I))))
246 return true;
247 }
248 }
249 return false;
250}
251
252uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
253 StringRef AttributeStr =
254 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
255 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
256 SmallVector<StringRef, 8> Features;
257 FeatureStr.split(Features, ",");
258 return AArch64::getFMVPriority(Features);
259}
260
261bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
262 return F.hasFnAttribute("fmv-features");
263}
264
265bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
266 const Function *Callee) const {
267 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
268
269 // When inlining, we should consider the body of the function, not the
270 // interface.
271 if (CalleeAttrs.hasStreamingBody()) {
272 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
273 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
274 }
275
276 if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
277 return false;
278
279 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
280 CallerAttrs.requiresSMChange(CalleeAttrs) ||
281 CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
282 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
283 if (hasPossibleIncompatibleOps(Callee))
284 return false;
285 }
286
287 return BaseT::areInlineCompatible(Caller, Callee);
288}
289
290bool AArch64TTIImpl::areTypesABICompatible(
291 const Function *Caller, const Function *Callee,
292 const ArrayRef<Type *> &Types) const {
293 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
294 return false;
295
296 // We need to ensure that argument promotion does not attempt to promote
297 // pointers to fixed-length vector types larger than 128 bits like
298 // <8 x float> (and pointers to aggregate types which have such fixed-length
299 // vector type members) into the values of the pointees. Such vector types
300 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
301 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
302 // types can be safely treated as 128-bit NEON types and they cannot be
303 // distinguished in IR.
304 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
305 auto FVTy = dyn_cast<FixedVectorType>(Ty);
306 return FVTy &&
307 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
308 }))
309 return false;
310
311 return true;
312}
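// Illustrative note (not from the upstream file), assuming
// ST->useSVEForFixedLengthVectors() returns true: a pointee type of
// <8 x float> has 32 * 8 = 256 bits, so the lambda above returns true and the
// types are reported as ABI-incompatible, blocking argument promotion; a
// <4 x float> pointee (exactly 128 bits) passes, since it can be treated as a
// plain NEON type.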
313
314unsigned
315AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
316 unsigned DefaultCallPenalty) const {
317 // This function calculates a penalty for executing Call in F.
318 //
319 // There are two ways this function can be called:
320 // (1) F:
321 // call from F -> G (the call here is Call)
322 //
323 // For (1), Call.getCaller() == F, so it will always return a high cost if
324 // a streaming-mode change is required (thus promoting the need to inline the
325 // function)
326 //
327 // (2) F:
328 // call from F -> G (the call here is not Call)
329 // G:
330 // call from G -> H (the call here is Call)
331 //
332 // For (2), if after inlining the body of G into F the call to H requires a
333 // streaming-mode change, and the call to G from F would also require a
334 // streaming-mode change, then there is benefit to do the streaming-mode
335 // change only once and avoid inlining of G into F.
336 SMEAttrs FAttrs(*F);
337 SMEAttrs CalleeAttrs(Call);
338 if (FAttrs.requiresSMChange(CalleeAttrs)) {
339 if (F == Call.getCaller()) // (1)
340 return CallPenaltyChangeSM * DefaultCallPenalty;
341 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
342 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
343 }
344
345 return DefaultCallPenalty;
346}
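// Worked example (illustrative, using the option defaults above:
// CallPenaltyChangeSM = 5, InlineCallPenaltyChangeSM = 10): in case (1),
// where F itself contains Call and the callee needs a PSTATE.SM change, the
// penalty is 5 * DefaultCallPenalty, which encourages inlining the callee; in
// case (2), where the mode change would only appear after inlining G into F,
// the penalty is 10 * DefaultCallPenalty, which discourages inlining G.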
347
348bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
349 TargetTransformInfo::RegisterKind K) const {
350 assert(K != TargetTransformInfo::RGK_Scalar);
351 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
352 ST->isNeonAvailable());
353}
354
355/// Calculate the cost of materializing a 64-bit value. This helper
356/// method might only calculate a fraction of a larger immediate. Therefore it
357/// is valid to return a cost of ZERO.
358InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
359 // Check if the immediate can be encoded within an instruction.
360 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
361 return 0;
362
363 if (Val < 0)
364 Val = ~Val;
365
366 // Calculate how many moves we will need to materialize this constant.
367 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
368 AArch64_IMM::expandMOVImm(Val, 64, Insn);
369 return Insn.size();
370}
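// Example costs (illustrative sketch based on the logic above):
//   getIntImmCost(0)                  -> 0 (no instruction needed)
//   getIntImmCost(0x00FF00FF00FF00FF) -> 0 (encodable as a logical immediate)
//   getIntImmCost(0x123456789ABCDEF0) -> 4 (MOVZ plus three MOVKs, as reported
//                                           by AArch64_IMM::expandMOVImm)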
371
372/// Calculate the cost of materializing the given constant.
373InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
374 TTI::TargetCostKind CostKind) {
375 assert(Ty->isIntegerTy());
376
377 unsigned BitSize = Ty->getPrimitiveSizeInBits();
378 if (BitSize == 0)
379 return ~0U;
380
381 // Sign-extend all constants to a multiple of 64-bit.
382 APInt ImmVal = Imm;
383 if (BitSize & 0x3f)
384 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
385
386 // Split the constant into 64-bit chunks and calculate the cost for each
387 // chunk.
388 InstructionCost Cost = 0;
389 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
390 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
391 int64_t Val = Tmp.getSExtValue();
392 Cost += getIntImmCost(Val);
393 }
394 // We need at least one instruction to materialize the constant.
395 return std::max<InstructionCost>(1, Cost);
396}
397
398InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
399 const APInt &Imm, Type *Ty,
400 TTI::TargetCostKind CostKind,
401 Instruction *Inst) {
402 assert(Ty->isIntegerTy());
403
404 unsigned BitSize = Ty->getPrimitiveSizeInBits();
405 // There is no cost model for constants with a bit size of 0. Return TCC_Free
406 // here, so that constant hoisting will ignore this constant.
407 if (BitSize == 0)
408 return TTI::TCC_Free;
409
410 unsigned ImmIdx = ~0U;
411 switch (Opcode) {
412 default:
413 return TTI::TCC_Free;
414 case Instruction::GetElementPtr:
415 // Always hoist the base address of a GetElementPtr.
416 if (Idx == 0)
417 return 2 * TTI::TCC_Basic;
418 return TTI::TCC_Free;
419 case Instruction::Store:
420 ImmIdx = 0;
421 break;
422 case Instruction::Add:
423 case Instruction::Sub:
424 case Instruction::Mul:
425 case Instruction::UDiv:
426 case Instruction::SDiv:
427 case Instruction::URem:
428 case Instruction::SRem:
429 case Instruction::And:
430 case Instruction::Or:
431 case Instruction::Xor:
432 case Instruction::ICmp:
433 ImmIdx = 1;
434 break;
435 // Always return TCC_Free for the shift value of a shift instruction.
436 case Instruction::Shl:
437 case Instruction::LShr:
438 case Instruction::AShr:
439 if (Idx == 1)
440 return TTI::TCC_Free;
441 break;
442 case Instruction::Trunc:
443 case Instruction::ZExt:
444 case Instruction::SExt:
445 case Instruction::IntToPtr:
446 case Instruction::PtrToInt:
447 case Instruction::BitCast:
448 case Instruction::PHI:
449 case Instruction::Call:
450 case Instruction::Select:
451 case Instruction::Ret:
452 case Instruction::Load:
453 break;
454 }
455
456 if (Idx == ImmIdx) {
457 int NumConstants = (BitSize + 63) / 64;
458 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
459 return (Cost <= NumConstants * TTI::TCC_Basic)
460 ? static_cast<int>(TTI::TCC_Free)
461 : Cost;
462 }
463 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
464}
465
466InstructionCost
467AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
468 const APInt &Imm, Type *Ty,
469 TTI::TargetCostKind CostKind) {
470 assert(Ty->isIntegerTy());
471
472 unsigned BitSize = Ty->getPrimitiveSizeInBits();
473 // There is no cost model for constants with a bit size of 0. Return TCC_Free
474 // here, so that constant hoisting will ignore this constant.
475 if (BitSize == 0)
476 return TTI::TCC_Free;
477
478 // Most (all?) AArch64 intrinsics do not support folding immediates into the
479 // selected instruction, so we compute the materialization cost for the
480 // immediate directly.
481 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
482 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
483
484 switch (IID) {
485 default:
486 return TTI::TCC_Free;
487 case Intrinsic::sadd_with_overflow:
488 case Intrinsic::uadd_with_overflow:
489 case Intrinsic::ssub_with_overflow:
490 case Intrinsic::usub_with_overflow:
491 case Intrinsic::smul_with_overflow:
492 case Intrinsic::umul_with_overflow:
493 if (Idx == 1) {
494 int NumConstants = (BitSize + 63) / 64;
495 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
496 return (Cost <= NumConstants * TTI::TCC_Basic)
497 ? static_cast<int>(TTI::TCC_Free)
498 : Cost;
499 }
500 break;
501 case Intrinsic::experimental_stackmap:
502 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
503 return TTI::TCC_Free;
504 break;
505 case Intrinsic::experimental_patchpoint_void:
506 case Intrinsic::experimental_patchpoint:
507 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
508 return TTI::TCC_Free;
509 break;
510 case Intrinsic::experimental_gc_statepoint:
511 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
512 return TTI::TCC_Free;
513 break;
514 }
515 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
516}
517
518TTI::PopcntSupportKind
519AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
520 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
521 if (TyWidth == 32 || TyWidth == 64)
522 return TTI::PSK_FastHardware;
523 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
524 return TTI::PSK_Software;
525}
526
527static bool isUnpackedVectorVT(EVT VecVT) {
528 return VecVT.isScalableVector() &&
529 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
530}
531
532static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
533 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
534 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
535 unsigned TotalHistCnts = 1;
536
537 unsigned EltSize = EltTy->getScalarSizeInBits();
538 // Only allow (up to 64b) integers or pointers
539 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
540 return InstructionCost::getInvalid();
541
542 // FIXME: We should be able to generate histcnt for fixed-length vectors
543 // using ptrue with a specific VL.
544 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
545 unsigned EC = VTy->getElementCount().getKnownMinValue();
546 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
547 return InstructionCost::getInvalid();
548
549 // HistCnt only supports 32b and 64b element types
550 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
551
552 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
553 return InstructionCost(BaseHistCntCost);
554
555 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
556 TotalHistCnts = EC / NaturalVectorWidth;
557 }
558
559 return InstructionCost(BaseHistCntCost * TotalHistCnts);
560}
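// Worked example (illustrative, with the default aarch64-base-histcnt-cost of
// 8): for a <vscale x 8 x ptr> bucket vector with i32 elements, EC = 8 and
// LegalEltSize = 32, so NaturalVectorWidth = 128 / 32 = 4, TotalHistCnts = 2,
// and the returned cost is 8 * 2 = 16.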
561
562InstructionCost
563AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
564 TTI::TargetCostKind CostKind) {
565 // The code-generator is currently not able to handle scalable vectors
566 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
567 // it. This change will be removed when code-generation for these types is
568 // sufficiently reliable.
569 auto *RetTy = ICA.getReturnType();
570 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
571 if (VTy->getElementCount() == ElementCount::getScalable(1))
572 return InstructionCost::getInvalid();
573
574 switch (ICA.getID()) {
575 case Intrinsic::experimental_vector_histogram_add:
576 if (!ST->hasSVE2())
577 return InstructionCost::getInvalid();
578 return getHistogramCost(ICA);
579 case Intrinsic::umin:
580 case Intrinsic::umax:
581 case Intrinsic::smin:
582 case Intrinsic::smax: {
583 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
584 MVT::v8i16, MVT::v2i32, MVT::v4i32,
585 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
586 MVT::nxv2i64};
587 auto LT = getTypeLegalizationCost(RetTy);
588 // v2i64 types get converted to cmp+bif hence the cost of 2
589 if (LT.second == MVT::v2i64)
590 return LT.first * 2;
591 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
592 return LT.first;
593 break;
594 }
595 case Intrinsic::sadd_sat:
596 case Intrinsic::ssub_sat:
597 case Intrinsic::uadd_sat:
598 case Intrinsic::usub_sat: {
599 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
600 MVT::v8i16, MVT::v2i32, MVT::v4i32,
601 MVT::v2i64};
602 auto LT = getTypeLegalizationCost(RetTy);
603 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
604 // need to extend the type, as it uses shr(qadd(shl, shl)).
605 unsigned Instrs =
606 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
607 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
608 return LT.first * Instrs;
609 break;
610 }
611 case Intrinsic::abs: {
612 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
613 MVT::v8i16, MVT::v2i32, MVT::v4i32,
614 MVT::v2i64};
615 auto LT = getTypeLegalizationCost(RetTy);
616 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
617 return LT.first;
618 break;
619 }
620 case Intrinsic::bswap: {
621 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
622 MVT::v4i32, MVT::v2i64};
623 auto LT = getTypeLegalizationCost(RetTy);
624 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
625 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
626 return LT.first;
627 break;
628 }
629 case Intrinsic::stepvector: {
630 InstructionCost Cost = 1; // Cost of the `index' instruction
631 auto LT = getTypeLegalizationCost(RetTy);
632 // Legalisation of illegal vectors involves an `index' instruction plus
633 // (LT.first - 1) vector adds.
634 if (LT.first > 1) {
635 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
636 InstructionCost AddCost =
637 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
638 Cost += AddCost * (LT.first - 1);
639 }
640 return Cost;
641 }
642 case Intrinsic::vector_extract:
643 case Intrinsic::vector_insert: {
644 // If both the vector and subvector types are legal types and the index
645 // is 0, then this should be a no-op or simple operation; return a
646 // relatively low cost.
647
648 // If arguments aren't actually supplied, then we cannot determine the
649 // value of the index. We also want to skip predicate types.
650 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
651 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
652 break;
653
654 LLVMContext &C = RetTy->getContext();
655 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
656 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
657 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
658 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
659 // Skip this if either the vector or subvector types are unpacked
660 // SVE types; they may get lowered to stack stores and loads.
661 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
662 break;
663
664 TargetLoweringBase::LegalizeKind SubVecLK =
665 getTLI()->getTypeConversion(C, SubVecVT);
666 TargetLoweringBase::LegalizeKind VecLK =
667 getTLI()->getTypeConversion(C, VecVT);
668 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
669 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
670 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
671 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
672 return TTI::TCC_Free;
673 break;
674 }
675 case Intrinsic::bitreverse: {
676 static const CostTblEntry BitreverseTbl[] = {
677 {Intrinsic::bitreverse, MVT::i32, 1},
678 {Intrinsic::bitreverse, MVT::i64, 1},
679 {Intrinsic::bitreverse, MVT::v8i8, 1},
680 {Intrinsic::bitreverse, MVT::v16i8, 1},
681 {Intrinsic::bitreverse, MVT::v4i16, 2},
682 {Intrinsic::bitreverse, MVT::v8i16, 2},
683 {Intrinsic::bitreverse, MVT::v2i32, 2},
684 {Intrinsic::bitreverse, MVT::v4i32, 2},
685 {Intrinsic::bitreverse, MVT::v1i64, 2},
686 {Intrinsic::bitreverse, MVT::v2i64, 2},
687 };
688 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
689 const auto *Entry =
690 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
691 if (Entry) {
692 // Cost Model is using the legal type(i32) that i8 and i16 will be
693 // converted to +1 so that we match the actual lowering cost
694 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
695 TLI->getValueType(DL, RetTy, true) == MVT::i16)
696 return LegalisationCost.first * Entry->Cost + 1;
697
698 return LegalisationCost.first * Entry->Cost;
699 }
700 break;
701 }
702 case Intrinsic::ctpop: {
703 if (!ST->hasNEON()) {
704 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
705 return getTypeLegalizationCost(RetTy).first * 12;
706 }
707 static const CostTblEntry CtpopCostTbl[] = {
708 {ISD::CTPOP, MVT::v2i64, 4},
709 {ISD::CTPOP, MVT::v4i32, 3},
710 {ISD::CTPOP, MVT::v8i16, 2},
711 {ISD::CTPOP, MVT::v16i8, 1},
712 {ISD::CTPOP, MVT::i64, 4},
713 {ISD::CTPOP, MVT::v2i32, 3},
714 {ISD::CTPOP, MVT::v4i16, 2},
715 {ISD::CTPOP, MVT::v8i8, 1},
716 {ISD::CTPOP, MVT::i32, 5},
717 };
718 auto LT = getTypeLegalizationCost(RetTy);
719 MVT MTy = LT.second;
720 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
721 // Extra cost of +1 when illegal vector types are legalized by promoting
722 // the integer type.
723 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
724 RetTy->getScalarSizeInBits()
725 ? 1
726 : 0;
727 return LT.first * Entry->Cost + ExtraCost;
728 }
729 break;
730 }
731 case Intrinsic::sadd_with_overflow:
732 case Intrinsic::uadd_with_overflow:
733 case Intrinsic::ssub_with_overflow:
734 case Intrinsic::usub_with_overflow:
735 case Intrinsic::smul_with_overflow:
736 case Intrinsic::umul_with_overflow: {
737 static const CostTblEntry WithOverflowCostTbl[] = {
738 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
739 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
740 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
741 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
742 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
743 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
744 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
745 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
746 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
747 {Intrinsic::usub_with_overflow, MVT::i8, 3},
748 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
749 {Intrinsic::usub_with_overflow, MVT::i16, 3},
750 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
751 {Intrinsic::usub_with_overflow, MVT::i32, 1},
752 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
753 {Intrinsic::usub_with_overflow, MVT::i64, 1},
754 {Intrinsic::smul_with_overflow, MVT::i8, 5},
755 {Intrinsic::umul_with_overflow, MVT::i8, 4},
756 {Intrinsic::smul_with_overflow, MVT::i16, 5},
757 {Intrinsic::umul_with_overflow, MVT::i16, 4},
758 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
759 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
760 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
761 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
762 };
763 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
764 if (MTy.isSimple())
765 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
766 MTy.getSimpleVT()))
767 return Entry->Cost;
768 break;
769 }
770 case Intrinsic::fptosi_sat:
771 case Intrinsic::fptoui_sat: {
772 if (ICA.getArgTypes().empty())
773 break;
774 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
775 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
776 EVT MTy = TLI->getValueType(DL, RetTy);
777 // Check for the legal types, which are where the size of the input and the
778 // output are the same, or we are using cvt f64->i32 or f32->i64.
779 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
780 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
781 LT.second == MVT::v2f64)) {
782 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
783 (LT.second == MVT::f64 && MTy == MVT::i32) ||
784 (LT.second == MVT::f32 && MTy == MVT::i64)))
785 return LT.first;
786 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
787 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
788 MTy.getScalarSizeInBits() == 64)
789 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
790 }
791 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
792 // f32.
793 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
794 return LT.first + getIntrinsicInstrCost(
795 {ICA.getID(),
796 RetTy,
797 {ICA.getArgTypes()[0]->getWithNewType(
798 Type::getFloatTy(RetTy->getContext()))}},
799 CostKind);
800 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
801 (LT.second == MVT::f16 && MTy == MVT::i64) ||
802 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
803 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
804 return LT.first;
805 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
806 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
807 MTy.getScalarSizeInBits() == 32)
808 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
809 // Extending vector types v8f16->v8i32. These current scalarize but the
810 // codegen could be better.
811 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
812 MTy.getScalarSizeInBits() == 64)
813 return MTy.getVectorNumElements() * 3;
814
815 // If we can we use a legal convert followed by a min+max
816 if ((LT.second.getScalarType() == MVT::f32 ||
817 LT.second.getScalarType() == MVT::f64 ||
818 LT.second.getScalarType() == MVT::f16) &&
819 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
820 Type *LegalTy =
821 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
822 if (LT.second.isVector())
823 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
824 InstructionCost Cost = 1;
825 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
826 LegalTy, {LegalTy, LegalTy});
827 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
828 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
829 LegalTy, {LegalTy, LegalTy});
830 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
831 return LT.first * Cost +
832 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
833 : 1);
834 }
835 // Otherwise we need to follow the default expansion that clamps the value
836 // using a float min/max with a fcmp+sel for nan handling when signed.
837 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
838 RetTy = RetTy->getScalarType();
839 if (LT.second.isVector()) {
840 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
841 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
842 }
843 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
844 InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
845 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
846 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
847 Cost +=
848 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
849 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
850 if (IsSigned) {
851 Type *CondTy = RetTy->getWithNewBitWidth(1);
852 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
853 CmpInst::FCMP_UNO, CostKind);
854 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
855 CmpInst::FCMP_UNO, CostKind);
856 }
857 return LT.first * Cost;
858 }
859 case Intrinsic::fshl:
860 case Intrinsic::fshr: {
861 if (ICA.getArgs().empty())
862 break;
863
864 // TODO: Add handling for fshl where third argument is not a constant.
865 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
866 if (!OpInfoZ.isConstant())
867 break;
868
869 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
870 if (OpInfoZ.isUniform()) {
871 // FIXME: The costs could be lower if the codegen is better.
872 static const CostTblEntry FshlTbl[] = {
873 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
874 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
875 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
876 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
877 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
878 // to avoid having to duplicate the costs.
879 const auto *Entry =
880 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
881 if (Entry)
882 return LegalisationCost.first * Entry->Cost;
883 }
884
885 auto TyL = getTypeLegalizationCost(RetTy);
886 if (!RetTy->isIntegerTy())
887 break;
888
889 // Estimate cost manually, as types like i8 and i16 will get promoted to
890 // i32 and CostTableLookup will ignore the extra conversion cost.
891 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
892 RetTy->getScalarSizeInBits() < 64) ||
893 (RetTy->getScalarSizeInBits() % 64 != 0);
894 unsigned ExtraCost = HigherCost ? 1 : 0;
895 if (RetTy->getScalarSizeInBits() == 32 ||
896 RetTy->getScalarSizeInBits() == 64)
897 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
898 // extr instruction.
899 else if (HigherCost)
900 ExtraCost = 1;
901 else
902 break;
903 return TyL.first + ExtraCost;
904 }
905 case Intrinsic::get_active_lane_mask: {
906 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
907 if (RetTy) {
908 EVT RetVT = getTLI()->getValueType(DL, RetTy);
909 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
910 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
911 !getTLI()->isTypeLegal(RetVT)) {
912 // We don't have enough context at this point to determine if the mask
913 // is going to be kept live after the block, which will force the vXi1
914 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
915 // For now, we just assume the vectorizer created this intrinsic and
916 // the result will be the input for a PHI. In this case the cost will
917 // be extremely high for fixed-width vectors.
918 // NOTE: getScalarizationOverhead returns a cost that's far too
919 // pessimistic for the actual generated codegen. In reality there are
920 // two instructions generated per lane.
921 return RetTy->getNumElements() * 2;
922 }
923 }
924 break;
925 }
926 case Intrinsic::experimental_vector_match: {
927 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
928 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
929 unsigned SearchSize = NeedleTy->getNumElements();
930 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
931 // Base cost for MATCH instructions. At least on the Neoverse V2 and
932 // Neoverse V3, these are cheap operations with the same latency as a
933 // vector ADD. In most cases, however, we also need to do an extra DUP.
934 // For fixed-length vectors we currently need an extra five to six
935 // instructions besides the MATCH.
936 InstructionCost Cost = 4;
937 if (isa<FixedVectorType>(RetTy))
938 Cost += 10;
939 return Cost;
940 }
941 break;
942 }
943 default:
944 break;
945 }
946 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
947}
948
949/// The function will remove redundant reinterprets casting in the presence
950/// of the control flow
951static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
952 IntrinsicInst &II) {
953 SmallVector<Instruction *, 32> Worklist;
954 auto RequiredType = II.getType();
955
956 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
957 assert(PN && "Expected Phi Node!");
958
959 // Don't create a new Phi unless we can remove the old one.
960 if (!PN->hasOneUse())
961 return std::nullopt;
962
963 for (Value *IncValPhi : PN->incoming_values()) {
964 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
965 if (!Reinterpret ||
966 Reinterpret->getIntrinsicID() !=
967 Intrinsic::aarch64_sve_convert_to_svbool ||
968 RequiredType != Reinterpret->getArgOperand(0)->getType())
969 return std::nullopt;
970 }
971
972 // Create the new Phi
973 IC.Builder.SetInsertPoint(PN);
974 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
975 Worklist.push_back(PN);
976
977 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
978 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
979 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
980 Worklist.push_back(Reinterpret);
981 }
982
983 // Cleanup Phi Node and reinterprets
984 return IC.replaceInstUsesWith(II, NPN);
985}
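// IR sketch of the combine above (illustrative, not from the upstream file):
//   %phi = phi <vscale x 16 x i1> [ %a.sv, %bb1 ], [ %b.sv, %bb2 ]
// where %a.sv and %b.sv are aarch64.sve.convert.to.svbool casts of
// <vscale x 4 x i1> values %a and %b, and %phi's only use is the
// convert.from.svbool being visited, becomes
//   %phi.new = phi <vscale x 4 x i1> [ %a, %bb1 ], [ %b, %bb2 ]
// with the reinterpret replaced by %phi.new; the old phi and casts then
// become dead.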
986
987// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
988// => (binop (pred) (from_svbool _) (from_svbool _))
989//
990// The above transformation eliminates a `to_svbool` in the predicate
991// operand of bitwise operation `binop` by narrowing the vector width of
992// the operation. For example, it would convert a `<vscale x 16 x i1>
993// and` into a `<vscale x 4 x i1> and`. This is profitable because
994// to_svbool must zero the new lanes during widening, whereas
995// from_svbool is free.
996static std::optional<Instruction *>
997tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
998 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
999 if (!BinOp)
1000 return std::nullopt;
1001
1002 auto IntrinsicID = BinOp->getIntrinsicID();
1003 switch (IntrinsicID) {
1004 case Intrinsic::aarch64_sve_and_z:
1005 case Intrinsic::aarch64_sve_bic_z:
1006 case Intrinsic::aarch64_sve_eor_z:
1007 case Intrinsic::aarch64_sve_nand_z:
1008 case Intrinsic::aarch64_sve_nor_z:
1009 case Intrinsic::aarch64_sve_orn_z:
1010 case Intrinsic::aarch64_sve_orr_z:
1011 break;
1012 default:
1013 return std::nullopt;
1014 }
1015
1016 auto BinOpPred = BinOp->getOperand(0);
1017 auto BinOpOp1 = BinOp->getOperand(1);
1018 auto BinOpOp2 = BinOp->getOperand(2);
1019
1020 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1021 if (!PredIntr ||
1022 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1023 return std::nullopt;
1024
1025 auto PredOp = PredIntr->getOperand(0);
1026 auto PredOpTy = cast<VectorType>(PredOp->getType());
1027 if (PredOpTy != II.getType())
1028 return std::nullopt;
1029
1030 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1031 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1032 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1033 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1034 if (BinOpOp1 == BinOpOp2)
1035 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1036 else
1037 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1038 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1039
1040 auto NarrowedBinOp =
1041 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1042 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1043}
1044
1045static std::optional<Instruction *>
1046instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1047 // If the reinterpret instruction operand is a PHI Node
1048 if (isa<PHINode>(II.getArgOperand(0)))
1049 return processPhiNode(IC, II);
1050
1051 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1052 return BinOpCombine;
1053
1054 // Ignore converts to/from svcount_t.
1055 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1056 isa<TargetExtType>(II.getType()))
1057 return std::nullopt;
1058
1059 SmallVector<Instruction *, 32> CandidatesForRemoval;
1060 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1061
1062 const auto *IVTy = cast<VectorType>(II.getType());
1063
1064 // Walk the chain of conversions.
1065 while (Cursor) {
1066 // If the type of the cursor has fewer lanes than the final result, zeroing
1067 // must take place, which breaks the equivalence chain.
1068 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1069 if (CursorVTy->getElementCount().getKnownMinValue() <
1070 IVTy->getElementCount().getKnownMinValue())
1071 break;
1072
1073 // If the cursor has the same type as I, it is a viable replacement.
1074 if (Cursor->getType() == IVTy)
1075 EarliestReplacement = Cursor;
1076
1077 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1078
1079 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1080 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1081 Intrinsic::aarch64_sve_convert_to_svbool ||
1082 IntrinsicCursor->getIntrinsicID() ==
1083 Intrinsic::aarch64_sve_convert_from_svbool))
1084 break;
1085
1086 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1087 Cursor = IntrinsicCursor->getOperand(0);
1088 }
1089
1090 // If no viable replacement in the conversion chain was found, there is
1091 // nothing to do.
1092 if (!EarliestReplacement)
1093 return std::nullopt;
1094
1095 return IC.replaceInstUsesWith(II, EarliestReplacement);
1096}
1097
1098static bool isAllActivePredicate(Value *Pred) {
1099 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1100 Value *UncastedPred;
1101 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1102 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1103 m_Value(UncastedPred)))))
1104 // If the predicate has the same or less lanes than the uncasted
1105 // predicate then we know the casting has no effect.
1106 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1107 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1108 Pred = UncastedPred;
1109
1110 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1111 m_ConstantInt<AArch64SVEPredPattern::all>()));
1112}
1113
1114// Simplify unary operation where predicate has all inactive lanes by replacing
1115// instruction with its operand
1116static std::optional<Instruction *>
1117instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
1118 bool hasInactiveVector) {
1119 int PredOperand = hasInactiveVector ? 1 : 0;
1120 int ReplaceOperand = hasInactiveVector ? 0 : 1;
1121 if (match(II.getOperand(PredOperand), m_ZeroInt())) {
1122 IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
1123 return IC.eraseInstFromFunction(II);
1124 }
1125 return std::nullopt;
1126}
1127
1128// Simplify unary operation where predicate has all inactive lanes or
1129// replace unused first operand with undef when all lanes are active
1130static std::optional<Instruction *>
1131instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
1132 if (isAllActivePredicate(II.getOperand(1)) &&
1133 !isa<llvm::UndefValue>(II.getOperand(0)) &&
1134 !isa<llvm::PoisonValue>(II.getOperand(0))) {
1135 Value *Undef = llvm::UndefValue::get(II.getType());
1136 return IC.replaceOperand(II, 0, Undef);
1137 }
1138 return instCombineSVENoActiveReplace(IC, II, true);
1139}
1140
1141// Erase unary operation where predicate has all inactive lanes
1142static std::optional<Instruction *>
1143instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
1144 int PredPos) {
1145 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1146 return IC.eraseInstFromFunction(II);
1147 }
1148 return std::nullopt;
1149}
1150
1151// Simplify operation where predicate has all inactive lanes by replacing
1152// instruction with zeroed object
1153static std::optional<Instruction *>
1154instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) {
1155 if (match(II.getOperand(0), m_ZeroInt())) {
1156 Constant *Node;
1157 Type *RetTy = II.getType();
1158 if (RetTy->isStructTy()) {
1159 auto StructT = cast<StructType>(RetTy);
1160 auto VecT = StructT->getElementType(0);
1161 SmallVector<Constant *, 4> ZerVec;
1162 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1163 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1164 : ConstantInt::get(VecT, 0));
1165 }
1166 Node = ConstantStruct::get(StructT, ZerVec);
1167 } else
1168 Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1169 : ConstantInt::get(II.getType(), 0);
1170
1171 IC.replaceInstUsesWith(II, Node);
1172 return IC.eraseInstFromFunction(II);
1173 }
1174 return std::nullopt;
1175}
1176
1177static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1178 IntrinsicInst &II) {
1179 // svsel(ptrue, x, y) => x
1180 auto *OpPredicate = II.getOperand(0);
1181 if (isAllActivePredicate(OpPredicate))
1182 return IC.replaceInstUsesWith(II, II.getOperand(1));
1183
1184 auto Select =
1185 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1186 return IC.replaceInstUsesWith(II, Select);
1187}
1188
1189static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1190 IntrinsicInst &II) {
1191 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1192 if (!Pg)
1193 return std::nullopt;
1194
1195 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1196 return std::nullopt;
1197
1198 const auto PTruePattern =
1199 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1200 if (PTruePattern != AArch64SVEPredPattern::vl1)
1201 return std::nullopt;
1202
1203 // The intrinsic is inserting into lane zero so use an insert instead.
1204 auto *IdxTy = Type::getInt64Ty(II.getContext());
1205 auto *Insert = InsertElementInst::Create(
1206 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1207 Insert->insertBefore(&II);
1208 Insert->takeName(&II);
1209
1210 return IC.replaceInstUsesWith(II, Insert);
1211}
1212
1213static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1214 IntrinsicInst &II) {
1215 // Replace DupX with a regular IR splat.
1216 auto *RetTy = cast<ScalableVectorType>(II.getType());
1217 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1218 II.getArgOperand(0));
1219 Splat->takeName(&II);
1220 return IC.replaceInstUsesWith(II, Splat);
1221}
1222
1223static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1224 IntrinsicInst &II) {
1225 LLVMContext &Ctx = II.getContext();
1226
1227 // Replace by zero constant when all lanes are inactive
1228 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1229 return II_NA;
1230
1231 // Check that the predicate is all active
1232 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1233 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1234 return std::nullopt;
1235
1236 const auto PTruePattern =
1237 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1238 if (PTruePattern != AArch64SVEPredPattern::all)
1239 return std::nullopt;
1240
1241 // Check that we have a compare of zero..
1242 auto *SplatValue =
1243 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1244 if (!SplatValue || !SplatValue->isZero())
1245 return std::nullopt;
1246
1247 // ..against a dupq
1248 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1249 if (!DupQLane ||
1250 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1251 return std::nullopt;
1252
1253 // Where the dupq is a lane 0 replicate of a vector insert
1254 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1255 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1256 return std::nullopt;
1257
1258 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1259 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1260 return std::nullopt;
1261
1262 // Where the vector insert is a fixed constant vector insert into undef at
1263 // index zero
1264 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1265 return std::nullopt;
1266
1267 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1268 return std::nullopt;
1269
1270 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1271 if (!ConstVec)
1272 return std::nullopt;
1273
1274 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1275 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1276 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1277 return std::nullopt;
1278
1279 unsigned NumElts = VecTy->getNumElements();
1280 unsigned PredicateBits = 0;
1281
1282 // Expand intrinsic operands to a 16-bit byte level predicate
1283 for (unsigned I = 0; I < NumElts; ++I) {
1284 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1285 if (!Arg)
1286 return std::nullopt;
1287 if (!Arg->isZero())
1288 PredicateBits |= 1 << (I * (16 / NumElts));
1289 }
1290
1291 // If all bits are zero bail early with an empty predicate
1292 if (PredicateBits == 0) {
1293 auto *PFalse = Constant::getNullValue(II.getType());
1294 PFalse->takeName(&II);
1295 return IC.replaceInstUsesWith(II, PFalse);
1296 }
1297
1298 // Calculate largest predicate type used (where byte predicate is largest)
1299 unsigned Mask = 8;
1300 for (unsigned I = 0; I < 16; ++I)
1301 if ((PredicateBits & (1 << I)) != 0)
1302 Mask |= (I % 8);
1303
1304 unsigned PredSize = Mask & -Mask;
1305 auto *PredType = ScalableVectorType::get(
1306 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1307
1308 // Ensure all relevant bits are set
1309 for (unsigned I = 0; I < 16; I += PredSize)
1310 if ((PredicateBits & (1 << I)) == 0)
1311 return std::nullopt;
1312
1313 auto *PTruePat =
1314 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1315 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1316 {PredType}, {PTruePat});
1317 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1318 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1319 auto *ConvertFromSVBool =
1320 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1321 {II.getType()}, {ConvertToSVBool});
1322
1323 ConvertFromSVBool->takeName(&II);
1324 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1325}
1326
1327static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1328 IntrinsicInst &II) {
1329 Value *Pg = II.getArgOperand(0);
1330 Value *Vec = II.getArgOperand(1);
1331 auto IntrinsicID = II.getIntrinsicID();
1332 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1333
1334 // lastX(splat(X)) --> X
1335 if (auto *SplatVal = getSplatValue(Vec))
1336 return IC.replaceInstUsesWith(II, SplatVal);
1337
1338 // If x and/or y is a splat value then:
1339 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1340 Value *LHS, *RHS;
1341 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1342 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1343 auto *OldBinOp = cast<BinaryOperator>(Vec);
1344 auto OpC = OldBinOp->getOpcode();
1345 auto *NewLHS =
1346 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1347 auto *NewRHS =
1348 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1349 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1350 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1351 return IC.replaceInstUsesWith(II, NewBinOp);
1352 }
1353 }
1354
1355 auto *C = dyn_cast<Constant>(Pg);
1356 if (IsAfter && C && C->isNullValue()) {
1357 // The intrinsic is extracting lane 0 so use an extract instead.
1358 auto *IdxTy = Type::getInt64Ty(II.getContext());
1359 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1360 Extract->insertBefore(&II);
1361 Extract->takeName(&II);
1362 return IC.replaceInstUsesWith(II, Extract);
1363 }
1364
1365 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1366 if (!IntrPG)
1367 return std::nullopt;
1368
1369 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1370 return std::nullopt;
1371
1372 const auto PTruePattern =
1373 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1374
1375 // Can the intrinsic's predicate be converted to a known constant index?
1376 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1377 if (!MinNumElts)
1378 return std::nullopt;
1379
1380 unsigned Idx = MinNumElts - 1;
1381 // Increment the index if extracting the element after the last active
1382 // predicate element.
1383 if (IsAfter)
1384 ++Idx;
1385
1386 // Ignore extracts whose index is larger than the known minimum vector
1387 // length. NOTE: This is an artificial constraint where we prefer to
1388 // maintain what the user asked for until an alternative is proven faster.
1389 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1390 if (Idx >= PgVTy->getMinNumElements())
1391 return std::nullopt;
1392
1393 // The intrinsic is extracting a fixed lane so use an extract instead.
1394 auto *IdxTy = Type::getInt64Ty(II.getContext());
1395 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1396 Extract->insertBefore(&II);
1397 Extract->takeName(&II);
1398 return IC.replaceInstUsesWith(II, Extract);
1399}
1400
1401static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1402 IntrinsicInst &II) {
1403 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1404 // integer variant across a variety of micro-architectures. Replace scalar
1405 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1406 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1407 // depending on the micro-architecture, but has been observed as generally
1408 // being faster, particularly when the CLAST[AB] op is a loop-carried
1409 // dependency.
1410 Value *Pg = II.getArgOperand(0);
1411 Value *Fallback = II.getArgOperand(1);
1412 Value *Vec = II.getArgOperand(2);
1413 Type *Ty = II.getType();
1414
1415 if (!Ty->isIntegerTy())
1416 return std::nullopt;
1417
1418 Type *FPTy;
1419 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1420 default:
1421 return std::nullopt;
1422 case 16:
1423 FPTy = IC.Builder.getHalfTy();
1424 break;
1425 case 32:
1426 FPTy = IC.Builder.getFloatTy();
1427 break;
1428 case 64:
1429 FPTy = IC.Builder.getDoubleTy();
1430 break;
1431 }
1432
1433 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1434 auto *FPVTy = VectorType::get(
1435 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1436 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1437 auto *FPII = IC.Builder.CreateIntrinsic(
1438 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1439 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1440 return IC.replaceInstUsesWith(II, FPIItoInt);
1441}
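// Illustrative before/after (pseudo-IR, not from the upstream file): a scalar
// integer form such as
//   %r = clasta(%pg, i32 %fallback, <vscale x 4 x i32> %v)
// is rewritten to
//   %r = bitcast(clasta(%pg, bitcast %fallback to float,
//                       bitcast %v to <vscale x 4 x float>)) to i32
// so that codegen selects the SIMD&FP form of CLASTA instead of the slower
// general-register form.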
1442
1443static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1444 IntrinsicInst &II) {
1445 LLVMContext &Ctx = II.getContext();
1446 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1447 // can work with RDFFR_PP for ptest elimination.
1448 auto *AllPat =
1449 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1450 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1451 {II.getType()}, {AllPat});
1452 auto *RDFFR =
1453 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1454 RDFFR->takeName(&II);
1455 return IC.replaceInstUsesWith(II, RDFFR);
1456}
1457
1458static std::optional<Instruction *>
1459instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1460 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1461
1462 if (Pattern == AArch64SVEPredPattern::all) {
1463 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1464 auto *VScale = IC.Builder.CreateVScale(StepVal);
1465 VScale->takeName(&II);
1466 return IC.replaceInstUsesWith(II, VScale);
1467 }
1468
1469 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1470
1471 return MinNumElts && NumElts >= MinNumElts
1472 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1473 II, ConstantInt::get(II.getType(), MinNumElts)))
1474 : std::nullopt;
1475}
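// Example (illustrative, assuming NumElts == 4 as for the cntw intrinsic): a
// call with the "all" pattern is replaced by vscale * 4, while a fixed pattern
// such as vl2 folds to the constant 2 because 2 <= 4; patterns requesting more
// elements than NumElts are left untouched.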
1476
1477static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1478 IntrinsicInst &II) {
1479 Value *PgVal = II.getArgOperand(0);
1480 Value *OpVal = II.getArgOperand(1);
1481
1482 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1483 // Later optimizations prefer this form.
1484 if (PgVal == OpVal &&
1485 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1486 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1487 Value *Ops[] = {PgVal, OpVal};
1488 Type *Tys[] = {PgVal->getType()};
1489
1490 auto *PTest =
1491 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1492 PTest->takeName(&II);
1493
1494 return IC.replaceInstUsesWith(II, PTest);
1495 }
1496
1497 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1498 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1499
1500 if (!Pg || !Op)
1501 return std::nullopt;
1502
1503 Intrinsic::ID OpIID = Op->getIntrinsicID();
1504
1505 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1506 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1507 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1508 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1509 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1510
1511 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1512
1513 PTest->takeName(&II);
1514 return IC.replaceInstUsesWith(II, PTest);
1515 }
1516
1517 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1518 // Later optimizations may rewrite sequence to use the flag-setting variant
1519 // of instruction X to remove PTEST.
1520 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1521 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1522 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1523 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1524 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1525 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1526 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1527 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1528 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1529 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1530 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1531 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1532 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1533 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1534 Type *Tys[] = {Pg->getType()};
1535
1536 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1537 PTest->takeName(&II);
1538
1539 return IC.replaceInstUsesWith(II, PTest);
1540 }
1541
1542 return std::nullopt;
1543}
1544
1545template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1546static std::optional<Instruction *>
1547instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1548 bool MergeIntoAddendOp) {
1549 Value *P = II.getOperand(0);
1550 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1551 if (MergeIntoAddendOp) {
1552 AddendOp = II.getOperand(1);
1553 Mul = II.getOperand(2);
1554 } else {
1555 AddendOp = II.getOperand(2);
1556 Mul = II.getOperand(1);
1557 }
1558
1559 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1560 m_Value(MulOp1))))
1561 return std::nullopt;
1562
1563 if (!Mul->hasOneUse())
1564 return std::nullopt;
1565
1566 Instruction *FMFSource = nullptr;
1567 if (II.getType()->isFPOrFPVectorTy()) {
1568 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1569 // Stop the combine when the flags on the inputs differ in case dropping
1570 // flags would lead to us missing out on more beneficial optimizations.
1571 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1572 return std::nullopt;
1573 if (!FAddFlags.allowContract())
1574 return std::nullopt;
1575 FMFSource = &II;
1576 }
1577
1578 CallInst *Res;
1579 if (MergeIntoAddendOp)
1580 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1581 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1582 else
1583 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1584 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1585
1586 return IC.replaceInstUsesWith(II, Res);
1587}
1588
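// Combine sve.ld1: an all-active predicate becomes a plain vector load, any
// other predicate becomes a generic masked load with a zeroinitializer
// passthru; metadata from the intrinsic is preserved in both cases.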
1589static std::optional<Instruction *>
1590 instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1591 Value *Pred = II.getOperand(0);
1592 Value *PtrOp = II.getOperand(1);
1593 Type *VecTy = II.getType();
1594
1595 // Replace by zero constant when all lanes are inactive
1596 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1597 return II_NA;
1598
1599 if (isAllActivePredicate(Pred)) {
1600 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1601 Load->copyMetadata(II);
1602 return IC.replaceInstUsesWith(II, Load);
1603 }
1604
1605 CallInst *MaskedLoad =
1606 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1607 Pred, ConstantAggregateZero::get(VecTy));
1608 MaskedLoad->copyMetadata(II);
1609 return IC.replaceInstUsesWith(II, MaskedLoad);
1610}
1611
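// Combine sve.st1: an all-active predicate becomes a plain store, any other
// predicate becomes a generic masked store; the intrinsic itself is erased.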
1612static std::optional<Instruction *>
1613 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1614 Value *VecOp = II.getOperand(0);
1615 Value *Pred = II.getOperand(1);
1616 Value *PtrOp = II.getOperand(2);
1617
1618 if (isAllActivePredicate(Pred)) {
1619 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1620 Store->copyMetadata(II);
1621 return IC.eraseInstFromFunction(II);
1622 }
1623
1624 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1625 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1626 MaskedStore->copyMetadata(II);
1627 return IC.eraseInstFromFunction(II);
1628}
1629
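// Map the unpredicated (_u) SVE floating-point binary intrinsics onto the
// equivalent IR opcode; anything unhandled maps to BinaryOpsEnd.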
1630 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1631 switch (Intrinsic) {
1632 case Intrinsic::aarch64_sve_fmul_u:
1633 return Instruction::BinaryOps::FMul;
1634 case Intrinsic::aarch64_sve_fadd_u:
1635 return Instruction::BinaryOps::FAdd;
1636 case Intrinsic::aarch64_sve_fsub_u:
1637 return Instruction::BinaryOps::FSub;
1638 default:
1639 return Instruction::BinaryOpsEnd;
1640 }
1641}
1642
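// Replace an SVE fmul_u/fadd_u/fsub_u call whose governing predicate is
// ptrue(all) with the equivalent IR binary operator, preserving the
// fast-math flags; strict-FP calls are left untouched.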
1643static std::optional<Instruction *>
1644 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1645 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1646 if (II.isStrictFP())
1647 return std::nullopt;
1648
1649 auto *OpPredicate = II.getOperand(0);
1650 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1651 if (BinOpCode == Instruction::BinaryOpsEnd ||
1652 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1653 m_ConstantInt<AArch64SVEPredPattern::all>())))
1654 return std::nullopt;
1655 auto BinOp = IC.Builder.CreateBinOpFMF(
1656 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
1657 return IC.replaceInstUsesWith(II, BinOp);
1658}
1659
1660// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1661// sve.add_u).
1662static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1663 Intrinsic::ID IID) {
1664 auto *OpPredicate = II.getOperand(0);
1665 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1666 m_ConstantInt<AArch64SVEPredPattern::all>())))
1667 return std::nullopt;
1668
1669 auto *Mod = II.getModule();
1670 auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()});
1671 II.setCalledFunction(NewDecl);
1672
1673 return &II;
1674}
1675
1676// Simplify operations where predicate has all inactive lanes or try to replace
1677// with _u form when all lanes are active
1678static std::optional<Instruction *>
1679 instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1680 Intrinsic::ID IID) {
1681 if (match(II.getOperand(0), m_ZeroInt())) {
1682 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1683 // inactive for sv[func]_m
1684 return IC.replaceInstUsesWith(II, II.getOperand(1));
1685 }
1686 return instCombineSVEAllActive(II, IID);
1687}
1688
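// sve.add: handle the all-active/no-active predicate cases via add_u first,
// then try to fold a one-use feeding sve.mul into mla (accumulating into the
// addend) or mad.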
1689static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1690 IntrinsicInst &II) {
1691 if (auto II_U =
1692 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1693 return II_U;
1694 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1695 Intrinsic::aarch64_sve_mla>(
1696 IC, II, true))
1697 return MLA;
1698 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1699 Intrinsic::aarch64_sve_mad>(
1700 IC, II, false))
1701 return MAD;
1702 return std::nullopt;
1703}
1704
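// The floating-point counterpart of the add combine: prefer fadd_u, then
// fuse a feeding fmul (or fmul_u) into fmla/fmad when contraction is allowed.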
1705static std::optional<Instruction *>
1706 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1707 if (auto II_U =
1708 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1709 return II_U;
1710 if (auto FMLA =
1711 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1712 Intrinsic::aarch64_sve_fmla>(IC, II,
1713 true))
1714 return FMLA;
1715 if (auto FMAD =
1716 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1717 Intrinsic::aarch64_sve_fmad>(IC, II,
1718 false))
1719 return FMAD;
1720 if (auto FMLA =
1721 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1722 Intrinsic::aarch64_sve_fmla>(IC, II,
1723 true))
1724 return FMLA;
1725 return std::nullopt;
1726}
1727
1728static std::optional<Instruction *>
1729 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1730 if (auto FMLA =
1731 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1732 Intrinsic::aarch64_sve_fmla>(IC, II,
1733 true))
1734 return FMLA;
1735 if (auto FMAD =
1736 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1737 Intrinsic::aarch64_sve_fmad>(IC, II,
1738 false))
1739 return FMAD;
1740 if (auto FMLA_U =
1741 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1742 Intrinsic::aarch64_sve_fmla_u>(
1743 IC, II, true))
1744 return FMLA_U;
1745 return instCombineSVEVectorBinOp(IC, II);
1746}
1747
1748static std::optional<Instruction *>
1749 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1750 if (auto II_U =
1751 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1752 return II_U;
1753 if (auto FMLS =
1754 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1755 Intrinsic::aarch64_sve_fmls>(IC, II,
1756 true))
1757 return FMLS;
1758 if (auto FMSB =
1759 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1760 Intrinsic::aarch64_sve_fnmsb>(
1761 IC, II, false))
1762 return FMSB;
1763 if (auto FMLS =
1764 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1765 Intrinsic::aarch64_sve_fmls>(IC, II,
1766 true))
1767 return FMLS;
1768 return std::nullopt;
1769}
1770
1771static std::optional<Instruction *>
1772 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1773 if (auto FMLS =
1774 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1775 Intrinsic::aarch64_sve_fmls>(IC, II,
1776 true))
1777 return FMLS;
1778 if (auto FMSB =
1779 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1780 Intrinsic::aarch64_sve_fnmsb>(
1781 IC, II, false))
1782 return FMSB;
1783 if (auto FMLS_U =
1784 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1785 Intrinsic::aarch64_sve_fmls_u>(
1786 IC, II, true))
1787 return FMLS_U;
1788 return instCombineSVEVectorBinOp(IC, II);
1789}
1790
1791static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1792 IntrinsicInst &II) {
1793 if (auto II_U =
1794 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1795 return II_U;
1796 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1797 Intrinsic::aarch64_sve_mls>(
1798 IC, II, true))
1799 return MLS;
1800 return std::nullopt;
1801}
1802
1803static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1804 IntrinsicInst &II,
1805 Intrinsic::ID IID) {
1806 auto *OpPredicate = II.getOperand(0);
1807 auto *OpMultiplicand = II.getOperand(1);
1808 auto *OpMultiplier = II.getOperand(2);
1809
1810 // Return true if a given instruction is a unit splat value, false otherwise.
1811 auto IsUnitSplat = [](auto *I) {
1812 auto *SplatValue = getSplatValue(I);
1813 if (!SplatValue)
1814 return false;
1815 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1816 };
1817
1818 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1819 // with a unit splat value, false otherwise.
1820 auto IsUnitDup = [](auto *I) {
1821 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1822 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1823 return false;
1824
1825 auto *SplatValue = IntrI->getOperand(2);
1826 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1827 };
1828
1829 if (IsUnitSplat(OpMultiplier)) {
1830 // [f]mul pg %n, (dupx 1) => %n
1831 OpMultiplicand->takeName(&II);
1832 return IC.replaceInstUsesWith(II, OpMultiplicand);
1833 } else if (IsUnitDup(OpMultiplier)) {
1834 // [f]mul pg %n, (dup pg 1) => %n
1835 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1836 auto *DupPg = DupInst->getOperand(1);
1837 // TODO: this is naive. The optimization is still valid if DupPg
1838 // 'encompasses' OpPredicate, not only if they're the same predicate.
1839 if (OpPredicate == DupPg) {
1840 OpMultiplicand->takeName(&II);
1841 return IC.replaceInstUsesWith(II, OpMultiplicand);
1842 }
1843 }
1844
1845 return instCombineSVEVectorBinOp(IC, II);
1846}
1847
1848static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1849 IntrinsicInst &II) {
1850 Value *UnpackArg = II.getArgOperand(0);
1851 auto *RetTy = cast<ScalableVectorType>(II.getType());
1852 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1853 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1854
1855 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1856 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1857 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1858 ScalarArg =
1859 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1860 Value *NewVal =
1861 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1862 NewVal->takeName(&II);
1863 return IC.replaceInstUsesWith(II, NewVal);
1864 }
1865
1866 return std::nullopt;
1867}
1868static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1869 IntrinsicInst &II) {
1870 auto *OpVal = II.getOperand(0);
1871 auto *OpIndices = II.getOperand(1);
1872 VectorType *VTy = cast<VectorType>(II.getType());
1873
1874 // Check whether OpIndices is a constant splat value < minimal element count
1875 // of result.
1876 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1877 if (!SplatValue ||
1878 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1879 return std::nullopt;
1880
1881 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1882 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1883 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1884 auto *VectorSplat =
1885 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1886
1887 VectorSplat->takeName(&II);
1888 return IC.replaceInstUsesWith(II, VectorSplat);
1889}
1890
1891static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1892 IntrinsicInst &II) {
1893 Value *A, *B;
1894 Type *RetTy = II.getType();
1895 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1896 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1897
1898 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1899 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1900 if ((match(II.getArgOperand(0),
1901 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1902 match(II.getArgOperand(1),
1903 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1904 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1905 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1906 auto *TyA = cast<ScalableVectorType>(A->getType());
1907 if (TyA == B->getType() &&
1908 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1909 auto *SubVec = IC.Builder.CreateInsertVector(
1910 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1911 auto *ConcatVec = IC.Builder.CreateInsertVector(
1912 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1913 ConcatVec->takeName(&II);
1914 return IC.replaceInstUsesWith(II, ConcatVec);
1915 }
1916 }
1917
1918 return std::nullopt;
1919}
1920
1921static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1922 IntrinsicInst &II) {
1923 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1924 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1925 Value *A, *B;
1926 if (match(II.getArgOperand(0),
1927 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1928 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1929 m_Specific(A), m_Specific(B))))
1930 return IC.replaceInstUsesWith(
1931 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1932
1933 return std::nullopt;
1934}
1935
1936static std::optional<Instruction *>
1937 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1938 Value *Mask = II.getOperand(0);
1939 Value *BasePtr = II.getOperand(1);
1940 Value *Index = II.getOperand(2);
1941 Type *Ty = II.getType();
1942 Value *PassThru = ConstantAggregateZero::get(Ty);
1943
1944 // Replace by zero constant when all lanes are inactive
1945 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1946 return II_NA;
1947
1948 // Contiguous gather => masked load.
1949 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1950 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1951 Value *IndexBase;
1952 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1953 m_Value(IndexBase), m_SpecificInt(1)))) {
1954 Align Alignment =
1955 BasePtr->getPointerAlignment(II.getDataLayout());
1956
1957 Type *VecPtrTy = PointerType::getUnqual(Ty);
1958 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1959 BasePtr, IndexBase);
1960 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1961 CallInst *MaskedLoad =
1962 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1963 MaskedLoad->takeName(&II);
1964 return IC.replaceInstUsesWith(II, MaskedLoad);
1965 }
1966
1967 return std::nullopt;
1968}
1969
1970static std::optional<Instruction *>
1971 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1972 Value *Val = II.getOperand(0);
1973 Value *Mask = II.getOperand(1);
1974 Value *BasePtr = II.getOperand(2);
1975 Value *Index = II.getOperand(3);
1976 Type *Ty = Val->getType();
1977
1978 // Contiguous scatter => masked store.
1979 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1980 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1981 Value *IndexBase;
1982 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1983 m_Value(IndexBase), m_SpecificInt(1)))) {
1984 Align Alignment =
1985 BasePtr->getPointerAlignment(II.getDataLayout());
1986
1987 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1988 BasePtr, IndexBase);
1989 Type *VecPtrTy = PointerType::getUnqual(Ty);
1990 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1991
1992 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1993
1994 return IC.eraseInstFromFunction(II);
1995 }
1996
1997 return std::nullopt;
1998}
1999
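// Fold a predicated sdiv by a constant power-of-two splat into the rounding
// arithmetic-shift intrinsic asrd; a negated power of two additionally
// negates the asrd result. Division by -1 is left alone.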
2000static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2001 IntrinsicInst &II) {
2002 Type *Int32Ty = IC.Builder.getInt32Ty();
2003 Value *Pred = II.getOperand(0);
2004 Value *Vec = II.getOperand(1);
2005 Value *DivVec = II.getOperand(2);
2006
2007 Value *SplatValue = getSplatValue(DivVec);
2008 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2009 if (!SplatConstantInt)
2010 return std::nullopt;
2011
2012 APInt Divisor = SplatConstantInt->getValue();
2013 const int64_t DivisorValue = Divisor.getSExtValue();
2014 if (DivisorValue == -1)
2015 return std::nullopt;
2016 if (DivisorValue == 1)
2017 IC.replaceInstUsesWith(II, Vec);
2018
2019 if (Divisor.isPowerOf2()) {
2020 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2021 auto ASRD = IC.Builder.CreateIntrinsic(
2022 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2023 return IC.replaceInstUsesWith(II, ASRD);
2024 }
2025 if (Divisor.isNegatedPowerOf2()) {
2026 Divisor.negate();
2027 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2028 auto ASRD = IC.Builder.CreateIntrinsic(
2029 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2030 auto NEG = IC.Builder.CreateIntrinsic(
2031 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2032 return IC.replaceInstUsesWith(II, NEG);
2033 }
2034
2035 return std::nullopt;
2036}
2037
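// Check whether the second half of Vec repeats the first half, recursing on
// the halved vector. Null entries stand for poison lanes and, when
// AllowPoison is set, may take the value of the matching lane. On success
// Vec is shrunk to the repeating half.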
2038bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2039 size_t VecSize = Vec.size();
2040 if (VecSize == 1)
2041 return true;
2042 if (!isPowerOf2_64(VecSize))
2043 return false;
2044 size_t HalfVecSize = VecSize / 2;
2045
2046 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2047 RHS != Vec.end(); LHS++, RHS++) {
2048 if (*LHS != nullptr && *RHS != nullptr) {
2049 if (*LHS == *RHS)
2050 continue;
2051 else
2052 return false;
2053 }
2054 if (!AllowPoison)
2055 return false;
2056 if (*LHS == nullptr && *RHS != nullptr)
2057 *LHS = *RHS;
2058 }
2059
2060 Vec.resize(HalfVecSize);
2061 SimplifyValuePattern(Vec, AllowPoison);
2062 return true;
2063}
2064
2065// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2066// to dupqlane(f64(C)) where C is A concatenated with B
2067static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2068 IntrinsicInst &II) {
2069 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2070 if (!match(II.getOperand(0),
2071 m_Intrinsic<Intrinsic::vector_insert>(
2072 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2073 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2074 return std::nullopt;
2075 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2076
2077 // Insert the scalars into a container ordered by InsertElement index
2078 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2079 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2080 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2081 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2082 CurrentInsertElt = InsertElt->getOperand(0);
2083 }
2084
2085 bool AllowPoison =
2086 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2087 if (!SimplifyValuePattern(Elts, AllowPoison))
2088 return std::nullopt;
2089
2090 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2091 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2092 for (size_t I = 0; I < Elts.size(); I++) {
2093 if (Elts[I] == nullptr)
2094 continue;
2095 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2096 IC.Builder.getInt64(I));
2097 }
2098 if (InsertEltChain == nullptr)
2099 return std::nullopt;
2100
2101 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2102 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2103 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2104 // be narrowed back to the original type.
2105 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2106 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2107 IIScalableTy->getMinNumElements() /
2108 PatternWidth;
2109
2110 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2111 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2112 auto *WideShuffleMaskTy =
2113 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2114
2115 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
2116 auto InsertSubvector = IC.Builder.CreateInsertVector(
2117 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
2118 auto WideBitcast =
2119 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2120 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2121 auto WideShuffle = IC.Builder.CreateShuffleVector(
2122 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2123 auto NarrowBitcast =
2124 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2125
2126 return IC.replaceInstUsesWith(II, NarrowBitcast);
2127}
2128
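// fmaxnm/fminnm of a value with itself is just that value.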
2129static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2130 IntrinsicInst &II) {
2131 Value *A = II.getArgOperand(0);
2132 Value *B = II.getArgOperand(1);
2133 if (A == B)
2134 return IC.replaceInstUsesWith(II, A);
2135
2136 return std::nullopt;
2137}
2138
2139static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2140 IntrinsicInst &II) {
2141 Value *Pred = II.getOperand(0);
2142 Value *Vec = II.getOperand(1);
2143 Value *Shift = II.getOperand(2);
2144
2145 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2146 Value *AbsPred, *MergedValue;
2147 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2148 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2149 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2150 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2151
2152 return std::nullopt;
2153
2154 // Transform is valid if any of the following are true:
2155 // * The ABS merge value is an undef or non-negative
2156 // * The ABS predicate is all active
2157 // * The ABS predicate and the SRSHL predicates are the same
2158 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2159 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2160 return std::nullopt;
2161
2162 // Only valid when the shift amount is non-negative, otherwise the rounding
2163 // behaviour of SRSHL cannot be ignored.
2164 if (!match(Shift, m_NonNegative()))
2165 return std::nullopt;
2166
2167 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2168 {II.getType()}, {Pred, Vec, Shift});
2169
2170 return IC.replaceInstUsesWith(II, LSL);
2171}
2172
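// Inserting a scalar into a vector that is already a splat of that same
// scalar leaves the vector unchanged.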
2173static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2174 IntrinsicInst &II) {
2175 Value *Vec = II.getOperand(0);
2176
2177 if (getSplatValue(Vec) == II.getOperand(1))
2178 return IC.replaceInstUsesWith(II, Vec);
2179
2180 return std::nullopt;
2181}
2182
2183static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2184 IntrinsicInst &II) {
2185 // If this barrier is post-dominated by an identical one, we can remove it.
2186 auto *NI = II.getNextNonDebugInstruction();
2187 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2188 auto CanSkipOver = [](Instruction *I) {
2189 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2190 };
2191 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2192 auto *NIBB = NI->getParent();
2193 NI = NI->getNextNonDebugInstruction();
2194 if (!NI) {
2195 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2196 NI = SuccBB->getFirstNonPHIOrDbgOrLifetime();
2197 else
2198 break;
2199 }
2200 }
2201 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2202 if (NextII && II.isIdenticalTo(NextII))
2203 return IC.eraseInstFromFunction(II);
2204
2205 return std::nullopt;
2206}
2207
2208std::optional<Instruction *>
2209 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2210 IntrinsicInst &II) const {
2211 Intrinsic::ID IID = II.getIntrinsicID();
2212 switch (IID) {
2213 default:
2214 break;
2215 case Intrinsic::aarch64_dmb:
2216 return instCombineDMB(IC, II);
2217 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
2218 case Intrinsic::aarch64_sve_fcvt_f16f32:
2219 case Intrinsic::aarch64_sve_fcvt_f16f64:
2220 case Intrinsic::aarch64_sve_fcvt_f32f16:
2221 case Intrinsic::aarch64_sve_fcvt_f32f64:
2222 case Intrinsic::aarch64_sve_fcvt_f64f16:
2223 case Intrinsic::aarch64_sve_fcvt_f64f32:
2224 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
2225 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
2226 case Intrinsic::aarch64_sve_fcvtx_f32f64:
2227 case Intrinsic::aarch64_sve_fcvtzs:
2228 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
2229 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
2230 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
2231 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
2232 case Intrinsic::aarch64_sve_fcvtzu:
2233 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
2234 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
2235 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
2236 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
2237 case Intrinsic::aarch64_sve_scvtf:
2238 case Intrinsic::aarch64_sve_scvtf_f16i32:
2239 case Intrinsic::aarch64_sve_scvtf_f16i64:
2240 case Intrinsic::aarch64_sve_scvtf_f32i64:
2241 case Intrinsic::aarch64_sve_scvtf_f64i32:
2242 case Intrinsic::aarch64_sve_ucvtf:
2243 case Intrinsic::aarch64_sve_ucvtf_f16i32:
2244 case Intrinsic::aarch64_sve_ucvtf_f16i64:
2245 case Intrinsic::aarch64_sve_ucvtf_f32i64:
2246 case Intrinsic::aarch64_sve_ucvtf_f64i32:
2247 return instCombineSVENoActiveReplace(IC, II, false);
2248 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
2249 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
2250 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
2251 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
2252 return instCombineSVENoActiveReplace(IC, II, true);
2253 case Intrinsic::aarch64_sve_st1_scatter:
2254 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2255 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2256 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2257 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2258 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2259 case Intrinsic::aarch64_sve_st1dq:
2260 case Intrinsic::aarch64_sve_st1q_scatter_index:
2261 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2262 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2263 case Intrinsic::aarch64_sve_st1wq:
2264 case Intrinsic::aarch64_sve_stnt1:
2265 case Intrinsic::aarch64_sve_stnt1_scatter:
2266 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2267 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2268 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2269 return instCombineSVENoActiveUnaryErase(IC, II, 1);
2270 case Intrinsic::aarch64_sve_st2:
2271 case Intrinsic::aarch64_sve_st2q:
2272 return instCombineSVENoActiveUnaryErase(IC, II, 2);
2273 case Intrinsic::aarch64_sve_st3:
2274 case Intrinsic::aarch64_sve_st3q:
2275 return instCombineSVENoActiveUnaryErase(IC, II, 3);
2276 case Intrinsic::aarch64_sve_st4:
2277 case Intrinsic::aarch64_sve_st4q:
2278 return instCombineSVENoActiveUnaryErase(IC, II, 4);
2279 case Intrinsic::aarch64_sve_addqv:
2280 case Intrinsic::aarch64_sve_and_z:
2281 case Intrinsic::aarch64_sve_bic_z:
2282 case Intrinsic::aarch64_sve_brka_z:
2283 case Intrinsic::aarch64_sve_brkb_z:
2284 case Intrinsic::aarch64_sve_brkn_z:
2285 case Intrinsic::aarch64_sve_brkpa_z:
2286 case Intrinsic::aarch64_sve_brkpb_z:
2287 case Intrinsic::aarch64_sve_cntp:
2288 case Intrinsic::aarch64_sve_compact:
2289 case Intrinsic::aarch64_sve_eor_z:
2290 case Intrinsic::aarch64_sve_eorv:
2291 case Intrinsic::aarch64_sve_eorqv:
2292 case Intrinsic::aarch64_sve_nand_z:
2293 case Intrinsic::aarch64_sve_nor_z:
2294 case Intrinsic::aarch64_sve_orn_z:
2295 case Intrinsic::aarch64_sve_orr_z:
2296 case Intrinsic::aarch64_sve_orv:
2297 case Intrinsic::aarch64_sve_orqv:
2298 case Intrinsic::aarch64_sve_pnext:
2299 case Intrinsic::aarch64_sve_rdffr_z:
2300 case Intrinsic::aarch64_sve_saddv:
2301 case Intrinsic::aarch64_sve_uaddv:
2302 case Intrinsic::aarch64_sve_umaxv:
2303 case Intrinsic::aarch64_sve_umaxqv:
2304 case Intrinsic::aarch64_sve_cmpeq:
2305 case Intrinsic::aarch64_sve_cmpeq_wide:
2306 case Intrinsic::aarch64_sve_cmpge:
2307 case Intrinsic::aarch64_sve_cmpge_wide:
2308 case Intrinsic::aarch64_sve_cmpgt:
2309 case Intrinsic::aarch64_sve_cmpgt_wide:
2310 case Intrinsic::aarch64_sve_cmphi:
2311 case Intrinsic::aarch64_sve_cmphi_wide:
2312 case Intrinsic::aarch64_sve_cmphs:
2313 case Intrinsic::aarch64_sve_cmphs_wide:
2314 case Intrinsic::aarch64_sve_cmple_wide:
2315 case Intrinsic::aarch64_sve_cmplo_wide:
2316 case Intrinsic::aarch64_sve_cmpls_wide:
2317 case Intrinsic::aarch64_sve_cmplt_wide:
2318 case Intrinsic::aarch64_sve_facge:
2319 case Intrinsic::aarch64_sve_facgt:
2320 case Intrinsic::aarch64_sve_fcmpeq:
2321 case Intrinsic::aarch64_sve_fcmpge:
2322 case Intrinsic::aarch64_sve_fcmpgt:
2323 case Intrinsic::aarch64_sve_fcmpne:
2324 case Intrinsic::aarch64_sve_fcmpuo:
2325 case Intrinsic::aarch64_sve_ld1_gather:
2326 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2327 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2328 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2329 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2330 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2331 case Intrinsic::aarch64_sve_ld1q_gather_index:
2332 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2333 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2334 case Intrinsic::aarch64_sve_ld1ro:
2335 case Intrinsic::aarch64_sve_ld1rq:
2336 case Intrinsic::aarch64_sve_ld1udq:
2337 case Intrinsic::aarch64_sve_ld1uwq:
2338 case Intrinsic::aarch64_sve_ld2_sret:
2339 case Intrinsic::aarch64_sve_ld2q_sret:
2340 case Intrinsic::aarch64_sve_ld3_sret:
2341 case Intrinsic::aarch64_sve_ld3q_sret:
2342 case Intrinsic::aarch64_sve_ld4_sret:
2343 case Intrinsic::aarch64_sve_ld4q_sret:
2344 case Intrinsic::aarch64_sve_ldff1:
2345 case Intrinsic::aarch64_sve_ldff1_gather:
2346 case Intrinsic::aarch64_sve_ldff1_gather_index:
2347 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2348 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2349 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2350 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2351 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2352 case Intrinsic::aarch64_sve_ldnf1:
2353 case Intrinsic::aarch64_sve_ldnt1:
2354 case Intrinsic::aarch64_sve_ldnt1_gather:
2355 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2356 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2357 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2358 return instCombineSVENoActiveZero(IC, II);
2359 case Intrinsic::aarch64_sve_prf:
2360 case Intrinsic::aarch64_sve_prfb_gather_index:
2361 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
2362 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
2363 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
2364 case Intrinsic::aarch64_sve_prfd_gather_index:
2365 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
2366 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
2367 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
2368 case Intrinsic::aarch64_sve_prfh_gather_index:
2369 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
2370 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
2371 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
2372 case Intrinsic::aarch64_sve_prfw_gather_index:
2373 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
2374 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
2375 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
2376 return instCombineSVENoActiveUnaryErase(IC, II, 0);
2377 case Intrinsic::aarch64_neon_fmaxnm:
2378 case Intrinsic::aarch64_neon_fminnm:
2379 return instCombineMaxMinNM(IC, II);
2380 case Intrinsic::aarch64_sve_convert_from_svbool:
2381 return instCombineConvertFromSVBool(IC, II);
2382 case Intrinsic::aarch64_sve_dup:
2383 return instCombineSVEDup(IC, II);
2384 case Intrinsic::aarch64_sve_dup_x:
2385 return instCombineSVEDupX(IC, II);
2386 case Intrinsic::aarch64_sve_cmpne:
2387 case Intrinsic::aarch64_sve_cmpne_wide:
2388 return instCombineSVECmpNE(IC, II);
2389 case Intrinsic::aarch64_sve_rdffr:
2390 return instCombineRDFFR(IC, II);
2391 case Intrinsic::aarch64_sve_lasta:
2392 case Intrinsic::aarch64_sve_lastb:
2393 return instCombineSVELast(IC, II);
2394 case Intrinsic::aarch64_sve_clasta_n:
2395 case Intrinsic::aarch64_sve_clastb_n:
2396 return instCombineSVECondLast(IC, II);
2397 case Intrinsic::aarch64_sve_cntd:
2398 return instCombineSVECntElts(IC, II, 2);
2399 case Intrinsic::aarch64_sve_cntw:
2400 return instCombineSVECntElts(IC, II, 4);
2401 case Intrinsic::aarch64_sve_cnth:
2402 return instCombineSVECntElts(IC, II, 8);
2403 case Intrinsic::aarch64_sve_cntb:
2404 return instCombineSVECntElts(IC, II, 16);
2405 case Intrinsic::aarch64_sve_ptest_any:
2406 case Intrinsic::aarch64_sve_ptest_first:
2407 case Intrinsic::aarch64_sve_ptest_last:
2408 return instCombineSVEPTest(IC, II);
2409 case Intrinsic::aarch64_sve_fabd:
2410 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2411 case Intrinsic::aarch64_sve_fadd:
2412 return instCombineSVEVectorFAdd(IC, II);
2413 case Intrinsic::aarch64_sve_fadd_u:
2414 return instCombineSVEVectorFAddU(IC, II);
2415 case Intrinsic::aarch64_sve_fdiv:
2416 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2417 case Intrinsic::aarch64_sve_fmax:
2418 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2419 case Intrinsic::aarch64_sve_fmaxnm:
2420 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2421 case Intrinsic::aarch64_sve_fmin:
2422 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2423 case Intrinsic::aarch64_sve_fminnm:
2424 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2425 case Intrinsic::aarch64_sve_fmla:
2426 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2427 case Intrinsic::aarch64_sve_fmls:
2428 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2429 case Intrinsic::aarch64_sve_fmul:
2430 if (auto II_U =
2431 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2432 return II_U;
2433 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2434 case Intrinsic::aarch64_sve_fmul_u:
2435 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2436 case Intrinsic::aarch64_sve_fmulx:
2437 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2438 case Intrinsic::aarch64_sve_fnmla:
2439 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2440 case Intrinsic::aarch64_sve_fnmls:
2441 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2442 case Intrinsic::aarch64_sve_fsub:
2443 return instCombineSVEVectorFSub(IC, II);
2444 case Intrinsic::aarch64_sve_fsub_u:
2445 return instCombineSVEVectorFSubU(IC, II);
2446 case Intrinsic::aarch64_sve_add:
2447 return instCombineSVEVectorAdd(IC, II);
2448 case Intrinsic::aarch64_sve_add_u:
2449 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2450 Intrinsic::aarch64_sve_mla_u>(
2451 IC, II, true);
2452 case Intrinsic::aarch64_sve_mla:
2453 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2454 case Intrinsic::aarch64_sve_mls:
2455 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2456 case Intrinsic::aarch64_sve_mul:
2457 if (auto II_U =
2458 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2459 return II_U;
2460 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2461 case Intrinsic::aarch64_sve_mul_u:
2462 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2463 case Intrinsic::aarch64_sve_sabd:
2464 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2465 case Intrinsic::aarch64_sve_smax:
2466 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2467 case Intrinsic::aarch64_sve_smin:
2468 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2469 case Intrinsic::aarch64_sve_smulh:
2470 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2471 case Intrinsic::aarch64_sve_sub:
2472 return instCombineSVEVectorSub(IC, II);
2473 case Intrinsic::aarch64_sve_sub_u:
2474 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2475 Intrinsic::aarch64_sve_mls_u>(
2476 IC, II, true);
2477 case Intrinsic::aarch64_sve_uabd:
2478 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2479 case Intrinsic::aarch64_sve_umax:
2480 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2481 case Intrinsic::aarch64_sve_umin:
2482 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2483 case Intrinsic::aarch64_sve_umulh:
2484 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2485 case Intrinsic::aarch64_sve_asr:
2486 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2487 case Intrinsic::aarch64_sve_lsl:
2488 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2489 case Intrinsic::aarch64_sve_lsr:
2490 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2491 case Intrinsic::aarch64_sve_and:
2492 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2493 case Intrinsic::aarch64_sve_bic:
2494 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2495 case Intrinsic::aarch64_sve_eor:
2496 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2497 case Intrinsic::aarch64_sve_orr:
2498 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2499 case Intrinsic::aarch64_sve_sqsub:
2500 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2501 case Intrinsic::aarch64_sve_uqsub:
2502 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2503 case Intrinsic::aarch64_sve_tbl:
2504 return instCombineSVETBL(IC, II);
2505 case Intrinsic::aarch64_sve_uunpkhi:
2506 case Intrinsic::aarch64_sve_uunpklo:
2507 case Intrinsic::aarch64_sve_sunpkhi:
2508 case Intrinsic::aarch64_sve_sunpklo:
2509 return instCombineSVEUnpack(IC, II);
2510 case Intrinsic::aarch64_sve_uzp1:
2511 return instCombineSVEUzp1(IC, II);
2512 case Intrinsic::aarch64_sve_zip1:
2513 case Intrinsic::aarch64_sve_zip2:
2514 return instCombineSVEZip(IC, II);
2515 case Intrinsic::aarch64_sve_ld1_gather_index:
2516 return instCombineLD1GatherIndex(IC, II);
2517 case Intrinsic::aarch64_sve_st1_scatter_index:
2518 return instCombineST1ScatterIndex(IC, II);
2519 case Intrinsic::aarch64_sve_ld1:
2520 return instCombineSVELD1(IC, II, DL);
2521 case Intrinsic::aarch64_sve_st1:
2522 return instCombineSVEST1(IC, II, DL);
2523 case Intrinsic::aarch64_sve_sdiv:
2524 return instCombineSVESDIV(IC, II);
2525 case Intrinsic::aarch64_sve_sel:
2526 return instCombineSVESel(IC, II);
2527 case Intrinsic::aarch64_sve_srshl:
2528 return instCombineSVESrshl(IC, II);
2529 case Intrinsic::aarch64_sve_dupq_lane:
2530 return instCombineSVEDupqLane(IC, II);
2531 case Intrinsic::aarch64_sve_insr:
2532 return instCombineSVEInsr(IC, II);
2533 }
2534
2535 return std::nullopt;
2536}
2537
2538 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2539 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2540 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2541 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2542 SimplifyAndSetOp) const {
2543 switch (II.getIntrinsicID()) {
2544 default:
2545 break;
2546 case Intrinsic::aarch64_neon_fcvtxn:
2547 case Intrinsic::aarch64_neon_rshrn:
2548 case Intrinsic::aarch64_neon_sqrshrn:
2549 case Intrinsic::aarch64_neon_sqrshrun:
2550 case Intrinsic::aarch64_neon_sqshrn:
2551 case Intrinsic::aarch64_neon_sqshrun:
2552 case Intrinsic::aarch64_neon_sqxtn:
2553 case Intrinsic::aarch64_neon_sqxtun:
2554 case Intrinsic::aarch64_neon_uqrshrn:
2555 case Intrinsic::aarch64_neon_uqshrn:
2556 case Intrinsic::aarch64_neon_uqxtn:
2557 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2558 break;
2559 }
2560
2561 return std::nullopt;
2562}
2563
2564 bool AArch64TTIImpl::enableScalableVectorization() const {
2565 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2566 EnableScalableAutovecInStreamingMode);
2567}
2568
2569 TypeSize
2570 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2571 switch (K) {
2572 case TargetTransformInfo::RGK_Scalar:
2573 return TypeSize::getFixed(64);
2574 case TargetTransformInfo::RGK_FixedWidthVector:
2575 if (ST->useSVEForFixedLengthVectors() &&
2576 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2577 return TypeSize::getFixed(
2578 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2579 else if (ST->isNeonAvailable())
2580 return TypeSize::getFixed(128);
2581 else
2582 return TypeSize::getFixed(0);
2583 case TargetTransformInfo::RGK_ScalableVector:
2584 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2585 EnableScalableAutovecInStreamingMode))
2586 return TypeSize::getScalable(128);
2587 else
2588 return TypeSize::getScalable(0);
2589 }
2590 llvm_unreachable("Unsupported register kind");
2591}
2592
2593bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2594 ArrayRef<const Value *> Args,
2595 Type *SrcOverrideTy) {
2596 // A helper that returns a vector type from the given type. The number of
2597 // elements in type Ty determines the vector width.
2598 auto toVectorTy = [&](Type *ArgTy) {
2599 return VectorType::get(ArgTy->getScalarType(),
2600 cast<VectorType>(DstTy)->getElementCount());
2601 };
2602
2603 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2604 // i32, i64]. SVE doesn't generally have the same set of instructions to
2605 // perform an extend with the add/sub/mul. There are SMULLB style
2606 // instructions, but they operate on top/bottom, requiring some sort of lane
2607 // interleaving to be used with zext/sext.
2608 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2609 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2610 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2611 return false;
2612
2613 // Determine if the operation has a widening variant. We consider both the
2614 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2615 // instructions.
2616 //
2617 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2618 // verify that their extending operands are eliminated during code
2619 // generation.
2620 Type *SrcTy = SrcOverrideTy;
2621 switch (Opcode) {
2622 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2623 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2624 // The second operand needs to be an extend
2625 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2626 if (!SrcTy)
2627 SrcTy =
2628 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2629 } else
2630 return false;
2631 break;
2632 case Instruction::Mul: { // SMULL(2), UMULL(2)
2633 // Both operands need to be extends of the same type.
2634 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2635 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2636 if (!SrcTy)
2637 SrcTy =
2638 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2639 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2640 // If one of the operands is a Zext and the other has enough zero bits to
2641 // be treated as unsigned, we can still generate a umull, meaning the zext
2642 // is free.
2643 KnownBits Known =
2644 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2645 if (Args[0]->getType()->getScalarSizeInBits() -
2646 Known.Zero.countLeadingOnes() >
2647 DstTy->getScalarSizeInBits() / 2)
2648 return false;
2649 if (!SrcTy)
2650 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2651 DstTy->getScalarSizeInBits() / 2));
2652 } else
2653 return false;
2654 break;
2655 }
2656 default:
2657 return false;
2658 }
2659
2660 // Legalize the destination type and ensure it can be used in a widening
2661 // operation.
2662 auto DstTyL = getTypeLegalizationCost(DstTy);
2663 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2664 return false;
2665
2666 // Legalize the source type and ensure it can be used in a widening
2667 // operation.
2668 assert(SrcTy && "Expected some SrcTy");
2669 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2670 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2671 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2672 return false;
2673
2674 // Get the total number of vector elements in the legalized types.
2675 InstructionCost NumDstEls =
2676 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2677 InstructionCost NumSrcEls =
2678 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2679
2680 // Return true if the legalized types have the same number of vector elements
2681 // and the destination element type size is twice that of the source type.
2682 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2683}
2684
2685// s/urhadd instructions implement the following pattern, making the
2686// extends free:
2687// %x = add ((zext i8 -> i16), 1)
2688// %y = (zext i8 -> i16)
2689// trunc i16 (lshr (add %x, %y), 1) -> i8
2690//
2691 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2692 Type *Src) {
2693 // The source should be a legal vector type.
2694 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2695 (Src->isScalableTy() && !ST->hasSVE2()))
2696 return false;
2697
2698 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2699 return false;
2700
2701 // Look for trunc/shl/add before trying to match the pattern.
2702 const Instruction *Add = ExtUser;
2703 auto *AddUser =
2704 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2705 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2706 Add = AddUser;
2707
2708 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2709 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2710 return false;
2711
2712 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2713 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2714 Src->getScalarSizeInBits() !=
2715 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2716 return false;
2717
2718 // Try to match the whole pattern. Ext could be either the first or second
2719 // m_ZExtOrSExt matched.
2720 Instruction *Ex1, *Ex2;
2721 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2722 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2723 return false;
2724
2725 // Ensure both extends are of the same type
2726 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2727 Ex1->getOpcode() == Ex2->getOpcode())
2728 return true;
2729
2730 return false;
2731}
2732
2733 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2734 Type *Src,
2735 TTI::CastContextHint CCH,
2736 TTI::TargetCostKind CostKind,
2737 const Instruction *I) {
2738 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2739 assert(ISD && "Invalid opcode");
2740 // If the cast is observable, and it is used by a widening instruction (e.g.,
2741 // uaddl, saddw, etc.), it may be free.
2742 if (I && I->hasOneUser()) {
2743 auto *SingleUser = cast<Instruction>(*I->user_begin());
2744 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2745 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2746 // For adds only count the second operand as free if both operands are
2747 // extends but not the same operation. (i.e. both operands are not free in
2748 // add(sext, zext)).
2749 if (SingleUser->getOpcode() == Instruction::Add) {
2750 if (I == SingleUser->getOperand(1) ||
2751 (isa<CastInst>(SingleUser->getOperand(1)) &&
2752 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2753 return 0;
2754 } else // Others are free so long as isWideningInstruction returned true.
2755 return 0;
2756 }
2757
2758 // The cast will be free for the s/urhadd instructions
2759 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2760 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2761 return 0;
2762 }
2763
2764 // TODO: Allow non-throughput costs that aren't binary.
2765 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2766 if (CostKind != TTI::TCK_RecipThroughput)
2767 return Cost == 0 ? 0 : 1;
2768 return Cost;
2769 };
2770
2771 EVT SrcTy = TLI->getValueType(DL, Src);
2772 EVT DstTy = TLI->getValueType(DL, Dst);
2773
2774 if (!SrcTy.isSimple() || !DstTy.isSimple())
2775 return AdjustCost(
2776 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2777
2778 static const TypeConversionCostTblEntry BF16Tbl[] = {
2779 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
2780 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
2781 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
2782 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
2783 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
2784 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
2785 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
2786 };
2787
2788 if (ST->hasBF16())
2789 if (const auto *Entry = ConvertCostTableLookup(
2790 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2791 return AdjustCost(Entry->Cost);
2792
2793 static const TypeConversionCostTblEntry ConversionTbl[] = {
2794 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2795 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2796 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2797 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2798 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2799 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2800 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2801 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2802 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2803 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2804 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2805 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2806 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2807 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2808 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2809 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2810 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2811 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2812 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2813 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2814
2815 // Truncations on nxvmiN
2816 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
2817 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
2818 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
2819 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
2820 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
2821 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
2822 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
2823 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
2824 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
2825 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
2826 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
2827 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
2828 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
2829 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
2830 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
2831 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
2832 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
2833 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
2834 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
2835 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
2836 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
2837 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
2838 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
2839 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
2840 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
2841 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
2842 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
2843 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
2844 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
2845 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
2846 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
2847 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
2848 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
2849
2850 // The number of shll instructions for the extension.
2851 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2852 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2853 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2854 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2855 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2856 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2857 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2858 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2859 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2860 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2861 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2862 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2863 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2864 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2865 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2866 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2867
2868 // FP Ext and trunc
2869 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
2870 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
2871 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
2872 // FP16
2873 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
2874 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
2875 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
2876 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
2877 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
2878 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
2879 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
2880 // BF16 (uses shift)
2881 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
2882 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
2883 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
2884 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
2885 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
2886 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
2887 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
2888 // FP Ext and trunc
2889 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
2890 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
2891 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
2892 // FP16
2893 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
2894 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
2895 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
2896 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
2897 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
2898 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
2899 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
2900 // BF16 (more complex, with +bf16 is handled above)
2901 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
2902 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
2903 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
2904 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
2905 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
2906 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
2907 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
2908 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
2909
2910 // LowerVectorINT_TO_FP:
2911 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2912 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2913 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2914 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2915 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2916 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2917
2918 // Complex: to v2f32
2919 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2920 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2921 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2922 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2923 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2924 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2925
2926 // Complex: to v4f32
2927 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
2928 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2929 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
2930 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2931
2932 // Complex: to v8f32
2933 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2934 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2935 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2936 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2937
2938 // Complex: to v16f32
2939 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2940 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2941
2942 // Complex: to v2f64
2943 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2944 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2945 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2946 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2947 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2948 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2949
2950 // Complex: to v4f64
2951 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2952 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2953
2954 // LowerVectorFP_TO_INT
2955 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
2956 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
2957 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
2958 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
2959 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
2960 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
2961
2962 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2963 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
2964 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
2965 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
2966 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
2967 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
2968 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
2969
2970 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2971 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
2972 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
2973 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
2974 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
2975
2976 // Complex, from nxv2f32.
2977 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2978 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2979 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2980 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2981 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2982 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2983 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2984 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2985
2986 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2987 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
2988 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
2989 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
2990 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
2991 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
2992 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
2993
2994 // Complex, from nxv2f64.
2995 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2996 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2997 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2998 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2999 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3000 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3001 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3002 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3003
3004 // Complex, from nxv4f32.
3005 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3006 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3007 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3008 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3009 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3010 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3011 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3012 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3013
3014 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3015 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3016 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3017 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3018 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3019
3020 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3021 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3022 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3023 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3024 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3025 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3026 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3027
3028 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3029 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3030 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3031 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3032 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3033
3034 // Complex, from nxv8f16.
3035 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3036 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3037 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3038 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3039 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3040 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3041 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3042 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3043
3044 // Complex, from nxv4f16.
3045 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3046 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3047 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3048 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3049 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3050 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3051 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3052 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3053
3054 // Complex, from nxv2f16.
3055 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3056 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3057 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3058 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3059 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3060 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3061 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3062 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3063
3064 // Truncate from nxvmf32 to nxvmf16.
3065 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3066 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3067 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3068
3069 // Truncate from nxvmf64 to nxvmf16.
3070 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3071 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3072 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3073
3074 // Truncate from nxvmf64 to nxvmf32.
3075 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3076 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3077 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3078
3079 // Extend from nxvmf16 to nxvmf32.
3080 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3081 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3082 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3083
3084 // Extend from nxvmf16 to nxvmf64.
3085 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3086 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3087 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3088
3089 // Extend from nxvmf32 to nxvmf64.
3090 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3091 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3092 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3093
3094 // Bitcasts from integer to float
3095 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3096 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3097 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3098
3099 // Bitcasts from float to integer
3100 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3101 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3102 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3103
3104 // Add cost for extending to illegal (too wide) scalable vectors.
3105 // zero/sign extend are implemented by multiple unpack operations,
3106 // where each operation has a cost of 1.
3107 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3108 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3109 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3110 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3111 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3112 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3113
3114 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3115 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3116 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3117 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3118 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3119 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3120 };
3121
3122 // We have to estimate the cost of a fixed-length operation performed on
3123 // SVE registers by the number of SVE registers required to represent the
3124 // fixed-length type.
3125 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3126 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3127 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3128 ST->useSVEForFixedLengthVectors(WiderTy)) {
3129 std::pair<InstructionCost, MVT> LT =
3130 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3131 unsigned NumElements =
3132 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3133 return AdjustCost(
3134 LT.first *
3135 getCastInstrCost(
3136 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3137 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3138 CostKind, I));
3139 }
3140
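// For example, an fptoui from <4 x float> to <4 x i16> matches the
// {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2} entry above (roughly an
// fcvtzu plus a narrowing), so the lookup below returns a cost of 2.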
3141 if (const auto *Entry = ConvertCostTableLookup(
3142 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3143 return AdjustCost(Entry->Cost);
3144
3145 static const TypeConversionCostTblEntry FP16Tbl[] = {
3146 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3147 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3148 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3149 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3150 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3151 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3152 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3153 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3154 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3155 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3156 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3157 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3158 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3159 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3160 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3161 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3162 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3163 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3164 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3165 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3166 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3167 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3168 };
3169
3170 if (ST->hasFullFP16())
3171 if (const auto *Entry = ConvertCostTableLookup(
3172 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3173 return AdjustCost(Entry->Cost);
3174
3175 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3176 CCH == TTI::CastContextHint::Masked &&
3177 ST->isSVEorStreamingSVEAvailable() &&
3178 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3179 TargetLowering::TypePromoteInteger &&
3180 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3181 TargetLowering::TypeSplitVector) {
3182 // The standard behaviour in the backend for these cases is to split the
3183 // extend up into two parts:
3184 // 1. Perform an extending load or masked load up to the legal type.
3185 // 2. Extend the loaded data to the final type.
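// For example (illustrative): a zero-extending masked load from
// <vscale x 8 x i8> to <vscale x 8 x i32> would roughly be costed as the
// nxv8i8 -> nxv8i16 extending load (Part1) plus the nxv8i16 -> nxv8i32
// extend of the loaded data (Part2).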
3186 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3187 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3188 InstructionCost Part1 = getCastInstrCost(
3189 Opcode, LegalTy, Src, CCH, CostKind, I);
3190 InstructionCost Part2 = getCastInstrCost(
3191 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3192 return Part1 + Part2;
3193 }
3194
3195 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3196 // but we also want to include the TTI::CastContextHint::Masked case too.
3197 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3198 CCH == TTI::CastContextHint::Masked &&
3199 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3200 CCH = TTI::CastContextHint::Normal;
3201
3202 return AdjustCost(
3203 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3204}
3205
3206 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
3207 Type *Dst,
3208 VectorType *VecTy,
3209 unsigned Index) {
3210
3211 // Make sure we were given a valid extend opcode.
3212 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3213 "Invalid opcode");
3214
3215 // We are extending an element we extract from a vector, so the source type
3216 // of the extend is the element type of the vector.
3217 auto *Src = VecTy->getElementType();
3218
3219 // Sign- and zero-extends are for integer types only.
3220 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3221
3222 // Get the cost for the extract. We compute the cost (if any) for the extend
3223 // below.
3224 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3225 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3226 CostKind, Index, nullptr, nullptr);
3227
3228 // Legalize the types.
3229 auto VecLT = getTypeLegalizationCost(VecTy);
3230 auto DstVT = TLI->getValueType(DL, Dst);
3231 auto SrcVT = TLI->getValueType(DL, Src);
3232
3233 // If the resulting type is still a vector and the destination type is legal,
3234 // we may get the extension for free. If not, get the default cost for the
3235 // extend.
3236 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3237 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3238 CostKind);
3239
3240 // The destination type should be larger than the element type. If not, get
3241 // the default cost for the extend.
3242 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3243 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3244 CostKind);
3245
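// For example, sign-extending an i32 element extracted from <4 x i32> to
// i64 is handled by smov and costs nothing beyond the extract itself,
// whereas zero-extending an extracted i8 from <16 x i8> all the way to i64
// needs an extra instruction after the umov.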
3246 switch (Opcode) {
3247 default:
3248 llvm_unreachable("Opcode should be either SExt or ZExt");
3249
3250 // For sign-extends, we only need a smov, which performs the extension
3251 // automatically.
3252 case Instruction::SExt:
3253 return Cost;
3254
3255 // For zero-extends, the extend is performed automatically by a umov unless
3256 // the destination type is i64 and the element type is i8 or i16.
3257 case Instruction::ZExt:
3258 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3259 return Cost;
3260 }
3261
3262 // If we are unable to perform the extend for free, get the default cost.
3263 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3264 CostKind);
3265}
3266
3267 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3268 TTI::TargetCostKind CostKind,
3269 const Instruction *I) {
3270 if (CostKind != TTI::TCK_RecipThroughput)
3271 return Opcode == Instruction::PHI ? 0 : 1;
3272 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3273 // Branches are assumed to be predicted.
3274 return 0;
3275}
3276
3277InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3278 unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3279 const Instruction *I, Value *Scalar,
3280 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3281 assert(Val->isVectorTy() && "This must be a vector type");
3282
3283 if (Index != -1U) {
3284 // Legalize the type.
3285 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3286
3287 // This type is legalized to a scalar type.
3288 if (!LT.second.isVector())
3289 return 0;
3290
3291 // The type may be split. For fixed-width vectors we can normalize the
3292 // index to the new type.
3293 if (LT.second.isFixedLengthVector()) {
3294 unsigned Width = LT.second.getVectorNumElements();
3295 Index = Index % Width;
3296 }
3297
3298 // The element at index zero is already inside the vector.
3299 // - For a physical (HasRealUse==true) insert-element or extract-element
3300 // instruction that extracts integers, an explicit FPR -> GPR move is
3301 // needed. So it has non-zero cost.
3302 // - For the rest of cases (virtual instruction or element type is float),
3303 // consider the instruction free.
3304 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3305 return 0;
3306
3307 // This recognises an LD1 single-element structure to one lane of one
3308 // register instruction. I.e., if this is an `insertelement` instruction,
3309 // and its second operand is a load, then we will generate a LD1, which
3310 // is an expensive instruction.
3311 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3312 return ST->getVectorInsertExtractBaseCost() + 1;
3313
3314 // i1 inserts and extracts will include an extra cset or cmp of the vector
3315 // value. Increase the cost by 1 to account for this.
3316 if (Val->getScalarSizeInBits() == 1)
3317 return ST->getVectorInsertExtractBaseCost() + 1;
3318
3319 // FIXME:
3320 // If the extract-element and insert-element instructions could be
3321 // simplified away (e.g., could be combined into users by looking at use-def
3322 // context), they have no cost. This is not done in the first place for
3323 // compile-time considerations.
3324 }
3325
3326 // In case of Neon, if there exists extractelement from lane != 0 such that
3327 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3328 // 2. extractelement result feeds into fmul.
3329 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3330 // equivalent to 0.
3331 // then the extractelement can be merged with fmul in the backend and it
3332 // incurs no cost.
3333 // e.g.
3334 // define double @foo(<2 x double> %a) {
3335 // %1 = extractelement <2 x double> %a, i32 0
3336 // %2 = extractelement <2 x double> %a, i32 1
3337 // %res = fmul double %1, %2
3338 // ret double %res
3339 // }
3340 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3341 auto ExtractCanFuseWithFmul = [&]() {
3342 // We bail out if the extract is from lane 0.
3343 if (Index == 0)
3344 return false;
3345
3346 // Check if the scalar element type of the vector operand of ExtractElement
3347 // instruction is one of the allowed types.
3348 auto IsAllowedScalarTy = [&](const Type *T) {
3349 return T->isFloatTy() || T->isDoubleTy() ||
3350 (T->isHalfTy() && ST->hasFullFP16());
3351 };
3352
3353 // Check if the extractelement user is scalar fmul.
3354 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3355 // Check if the user is scalar fmul.
3356 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3357 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3358 !BO->getType()->isVectorTy();
3359 };
3360
3361 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3362 // certain scalar type and a certain vector register width.
3363 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3364 auto RegWidth =
3366 .getFixedValue();
3367 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3368 };
3369
3370 // Check if the type constraints on input vector type and result scalar type
3371 // of extractelement instruction are satisfied.
3372 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3373 return false;
3374
3375 if (Scalar) {
3376 DenseMap<User *, unsigned> UserToExtractIdx;
3377 for (auto *U : Scalar->users()) {
3378 if (!IsUserFMulScalarTy(U))
3379 return false;
3380 // Recording entry for the user is important. Index value is not
3381 // important.
3382 UserToExtractIdx[U];
3383 }
3384 if (UserToExtractIdx.empty())
3385 return false;
3386 for (auto &[S, U, L] : ScalarUserAndIdx) {
3387 for (auto *U : S->users()) {
3388 if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3389 auto *FMul = cast<BinaryOperator>(U);
3390 auto *Op0 = FMul->getOperand(0);
3391 auto *Op1 = FMul->getOperand(1);
3392 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3393 UserToExtractIdx[U] = L;
3394 break;
3395 }
3396 }
3397 }
3398 }
3399 for (auto &[U, L] : UserToExtractIdx) {
3400 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3401 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3402 return false;
3403 }
3404 } else {
3405 const auto *EE = cast<ExtractElementInst>(I);
3406
3407 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3408 if (!IdxOp)
3409 return false;
3410
3411 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3412 if (!IsUserFMulScalarTy(U))
3413 return false;
3414
3415 // Check if the other operand of extractelement is also extractelement
3416 // from lane equivalent to 0.
3417 const auto *BO = cast<BinaryOperator>(U);
3418 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3419 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3420 if (OtherEE) {
3421 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3422 if (!IdxOp)
3423 return false;
3424 return IsExtractLaneEquivalentToZero(
3425 cast<ConstantInt>(OtherEE->getIndexOperand())
3426 ->getValue()
3427 .getZExtValue(),
3428 OtherEE->getType()->getScalarSizeInBits());
3429 }
3430 return true;
3431 });
3432 }
3433 return true;
3434 };
3435
3436 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3437 ExtractCanFuseWithFmul())
3438 return 0;
3439
3440 // All other insert/extracts cost this much.
3441 return ST->getVectorInsertExtractBaseCost();
3442}
3443
3444 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3445 TTI::TargetCostKind CostKind,
3446 unsigned Index, Value *Op0,
3447 Value *Op1) {
3448 bool HasRealUse =
3449 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3450 return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3451}
3452
3453 InstructionCost AArch64TTIImpl::getVectorInstrCost(
3454 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3455 Value *Scalar,
3456 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3457 return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3458 ScalarUserAndIdx);
3459}
3460
3461 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3462 Type *Val,
3463 TTI::TargetCostKind CostKind,
3464 unsigned Index) {
3465 return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3466 true /* HasRealUse */, &I);
3467}
3468
3469 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3470 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3472 if (isa<ScalableVectorType>(Ty))
3473 return InstructionCost::getInvalid();
3474 if (Ty->getElementType()->isFloatingPointTy())
3475 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3476 CostKind);
3477 return DemandedElts.popcount() * (Insert + Extract) *
3478 ST->getVectorInsertExtractBaseCost();
3479}
3480
3481 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3482 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3483 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3484 ArrayRef<const Value *> Args,
3485 const Instruction *CxtI) {
3486
3487 // The code-generator is currently not able to handle scalable vectors
3488 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3489 // it. This change will be removed when code-generation for these types is
3490 // sufficiently reliable.
3491 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3492 if (VTy->getElementCount() == ElementCount::getScalable(1))
3493 return InstructionCost::getInvalid();
3494
3495 // TODO: Handle more cost kinds.
3496 if (CostKind != TTI::TCK_RecipThroughput)
3497 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3498 Op2Info, Args, CxtI);
3499
3500 // Legalize the type.
3501 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3502 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3503
3504 switch (ISD) {
3505 default:
3506 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3507 Op2Info);
3508 case ISD::SDIV:
3509 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3510 // On AArch64, scalar signed division by a constant power of two is
3511 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3512 // The OperandValue properties may not be the same as those of the
3513 // previous operation; conservatively assume OP_None.
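// For example, assuming each of those four pieces costs 1 at
// TCK_RecipThroughput, an `sdiv i64 %x, 4` would be costed as roughly 4.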
3514 InstructionCost Cost = getArithmeticInstrCost(
3515 Instruction::Add, Ty, CostKind,
3516 Op1Info.getNoProps(), Op2Info.getNoProps());
3517 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3518 Op1Info.getNoProps(), Op2Info.getNoProps());
3519 Cost += getArithmeticInstrCost(
3520 Instruction::Select, Ty, CostKind,
3521 Op1Info.getNoProps(), Op2Info.getNoProps());
3522 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3523 Op1Info.getNoProps(), Op2Info.getNoProps());
3524 return Cost;
3525 }
3526 [[fallthrough]];
3527 case ISD::UDIV: {
3528 auto VT = TLI->getValueType(DL, Ty);
3529 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3530 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3531 // Vector signed division by a constant is expanded to the
3532 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3533 // to MULHU + SUB + SRL + ADD + SRL.
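// For example, assuming each of the mul/add/shift pieces costs 1, the
// expression below gives 2 + 2 + 2 + 1 = 7 for such a division.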
3534 InstructionCost MulCost = getArithmeticInstrCost(
3535 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3536 InstructionCost AddCost = getArithmeticInstrCost(
3537 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3538 InstructionCost ShrCost = getArithmeticInstrCost(
3539 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3540 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3541 }
3542 }
3543
3544 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
3545 // emitted by the backend even when those functions are not declared in the
3546 // module.
3547 if (!VT.isVector() && VT.getSizeInBits() > 64)
3548 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3549
3550 InstructionCost Cost = BaseT::getArithmeticInstrCost(
3551 Opcode, Ty, CostKind, Op1Info, Op2Info);
3552 if (Ty->isVectorTy()) {
3553 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3554 // If SDIV/UDIV operations are lowered using SVE, then the cost can be
3555 // lower.
3556 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3557 ->getPrimitiveSizeInBits()
3558 .getFixedValue() < 128) {
3559 EVT VT = TLI->getValueType(DL, Ty);
3560 static const CostTblEntry DivTbl[]{
3561 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3562 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3563 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3564 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3565 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3566 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3567
3568 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3569 if (nullptr != Entry)
3570 return Entry->Cost;
3571 }
3572 // For 8/16-bit elements, the cost is higher because the type
3573 // requires promotion and possibly splitting:
3574 if (LT.second.getScalarType() == MVT::i8)
3575 Cost *= 8;
3576 else if (LT.second.getScalarType() == MVT::i16)
3577 Cost *= 4;
3578 return Cost;
3579 } else {
3580 // If one of the operands is a uniform constant then the cost for each
3581 // element is the cost of insertion, extraction and division:
3582 // insertion cost = 2, extraction cost = 2, division = the cost of the
3583 // operation on the scalar type.
3584 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3585 (Op2Info.isConstant() && Op2Info.isUniform())) {
3586 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3587 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3588 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3589 return (4 + DivCost) * VTy->getNumElements();
3590 }
3591 }
3592 // On AArch64, without SVE, vector divisions are expanded
3593 // into scalar divisions of each pair of elements.
3594 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3595 CostKind, Op1Info, Op2Info);
3596 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3597 Op1Info, Op2Info);
3598 }
3599
3600 // TODO: if one of the arguments is scalar, then it's not necessary to
3601 // double the cost of handling the vector elements.
3602 Cost += Cost;
3603 }
3604 return Cost;
3605 }
3606 case ISD::MUL:
3607 // When SVE is available, then we can lower the v2i64 operation using
3608 // the SVE mul instruction, which has a lower cost.
3609 if (LT.second == MVT::v2i64 && ST->hasSVE())
3610 return LT.first;
3611
3612 // When SVE is not available, there is no MUL.2d instruction,
3613 // which means mul <2 x i64> is expensive as elements are extracted
3614 // from the vectors and the muls scalarized.
3615 // As getScalarizationOverhead is a bit too pessimistic, we
3616 // estimate the cost for a i64 vector directly here, which is:
3617 // - four 2-cost i64 extracts,
3618 // - two 2-cost i64 inserts, and
3619 // - two 1-cost muls.
3620 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3621 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3622 // need to scalarize so the cost can be cheaper (smull or umull).
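// That is, per the expression below and assuming the default insert/extract
// base cost of 2: 2 elements * (1 scalar mul + 2 * 2-cost extracts +
// one 2-cost insert) = 2 * 7 = 14.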
3624 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3625 return LT.first;
3626 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
3627 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
3628 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
3629 nullptr, nullptr) *
3630 2 +
3631 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
3632 nullptr, nullptr));
3633 case ISD::ADD:
3634 case ISD::XOR:
3635 case ISD::OR:
3636 case ISD::AND:
3637 case ISD::SRL:
3638 case ISD::SRA:
3639 case ISD::SHL:
3640 // These nodes are marked as 'custom' for combining purposes only.
3641 // We know that they are legal. See LowerAdd in ISelLowering.
3642 return LT.first;
3643
3644 case ISD::FNEG:
3645 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3646 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3647 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3648 CxtI &&
3649 ((CxtI->hasOneUse() &&
3650 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3651 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3652 return 0;
3653 [[fallthrough]];
3654 case ISD::FADD:
3655 case ISD::FSUB:
3656 // Increase the cost for half and bfloat types if not architecturally
3657 // supported.
3658 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3659 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3660 return 2 * LT.first;
3661 if (!Ty->getScalarType()->isFP128Ty())
3662 return LT.first;
3663 [[fallthrough]];
3664 case ISD::FMUL:
3665 case ISD::FDIV:
3666 // These nodes are marked as 'custom' just to lower them to SVE.
3667 // We know said lowering will incur no additional cost.
3668 if (!Ty->getScalarType()->isFP128Ty())
3669 return 2 * LT.first;
3670
3671 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3672 Op2Info);
3673 case ISD::FREM:
3674 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3675 // those functions are not declared in the module.
3676 if (!Ty->isVectorTy())
3677 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3678 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3679 Op2Info);
3680 }
3681}
3682
3683 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3684 ScalarEvolution *SE,
3685 const SCEV *Ptr) {
3686 // Address computations in vectorized code with non-consecutive addresses will
3687 // likely result in more instructions compared to scalar code where the
3688 // computation can more often be merged into the index mode. The resulting
3689 // extra micro-ops can significantly decrease throughput.
3690 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3691 int MaxMergeDistance = 64;
3692
3693 if (Ty->isVectorTy() && SE &&
3694 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3695 return NumVectorInstToHideOverhead;
3696
3697 // In many cases the address computation is not merged into the instruction
3698 // addressing mode.
3699 return 1;
3700}
3701
3702 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
3703 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3704 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3705 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3706 // TODO: Handle other cost kinds.
3707 if (CostKind != TTI::TCK_RecipThroughput)
3708 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3709 Op1Info, Op2Info, I);
3710
3711 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3712 // We don't lower some vector selects well when they are wider than the
3713 // register width.
3714 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3715 // We would need this many instructions to hide the scalarization happening.
3716 const int AmortizationCost = 20;
3717
3718 // If VecPred is not set, check if we can get a predicate from the context
3719 // instruction, if its type matches the requested ValTy.
3720 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3721 CmpPredicate CurrentPred;
3722 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3723 m_Value())))
3724 VecPred = CurrentPred;
3725 }
3726 // Check if we have a compare/select chain that can be lowered using
3727 // a (F)CMxx & BFI pair.
3728 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3729 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3730 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3731 VecPred == CmpInst::FCMP_UNE) {
3732 static const auto ValidMinMaxTys = {
3733 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3734 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3735 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3736
3737 auto LT = getTypeLegalizationCost(ValTy);
3738 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3739 (ST->hasFullFP16() &&
3740 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3741 return LT.first;
3742 }
3743
3744 static const TypeConversionCostTblEntry
3745 VectorSelectTbl[] = {
3746 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3747 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3748 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3749 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3750 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3751 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3752 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3753 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3754 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3755 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3756 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3757 };
3758
3759 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3760 EVT SelValTy = TLI->getValueType(DL, ValTy);
3761 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3762 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3763 SelCondTy.getSimpleVT(),
3764 SelValTy.getSimpleVT()))
3765 return Entry->Cost;
3766 }
3767 }
3768
3769 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3770 auto LT = getTypeLegalizationCost(ValTy);
3771 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3772 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3773 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3774 }
3775
3776 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3777 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3778 // be profitable.
3779 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3780 ICmpInst::isEquality(VecPred) &&
3781 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3782 match(I->getOperand(1), m_Zero()) &&
3783 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3784 return 0;
3785
3786 // The base case handles scalable vectors fine for now, since it treats the
3787 // cost as 1 * legalization cost.
3788 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3789 Op1Info, Op2Info, I);
3790}
3791
3792TTI::MemCmpExpansionOptions
3793AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3794 TTI::MemCmpExpansionOptions Options;
3795 if (ST->requiresStrictAlign()) {
3796 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3797 // a bunch of instructions when strict align is enabled.
3798 return Options;
3799 }
3800 Options.AllowOverlappingLoads = true;
3801 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3802 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3803 // TODO: Though vector loads usually perform well on AArch64, in some targets
3804 // they may wake up the FP unit, which raises the power consumption. Perhaps
3805 // they could be used with no holds barred (-O3).
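// For example, with these settings a 15-byte memcmp can be expanded using an
// 8-byte load plus an overlapping 8-byte load of the tail (bytes 7..14)
// rather than a libcall, and AllowedTailExpansions permits, e.g., a 3-byte
// tail to be handled as a 2-byte plus a 1-byte load (similarly 5 = 4 + 1 and
// 6 = 4 + 2).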
3806 Options.LoadSizes = {8, 4, 2, 1};
3807 Options.AllowedTailExpansions = {3, 5, 6};
3808 return Options;
3809}
3810
3811bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3812 return ST->hasSVE();
3813}
3814
3815InstructionCost
3816AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3817 Align Alignment, unsigned AddressSpace,
3818 TTI::TargetCostKind CostKind) {
3819 if (useNeonVector(Src))
3820 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3821 CostKind);
3822 auto LT = getTypeLegalizationCost(Src);
3823 if (!LT.first.isValid())
3824 return InstructionCost::getInvalid();
3825
3826 // Return an invalid cost for element types that we are unable to lower.
3827 auto *VT = cast<VectorType>(Src);
3828 if (VT->getElementType()->isIntegerTy(1))
3829 return InstructionCost::getInvalid();
3830
3831 // The code-generator is currently not able to handle scalable vectors
3832 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3833 // it. This change will be removed when code-generation for these types is
3834 // sufficiently reliable.
3835 if (VT->getElementCount() == ElementCount::getScalable(1))
3836 return InstructionCost::getInvalid();
3837
3838 return LT.first;
3839}
3840
3841// This function returns the gather/scatter overhead, either from the
3842// user-provided value or from per-target specialized values in \p ST.
3843static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3844 const AArch64Subtarget *ST) {
3845 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3846 "Should be called on only load or stores.");
3847 switch (Opcode) {
3848 case Instruction::Load:
3849 if (SVEGatherOverhead.getNumOccurrences() > 0)
3850 return SVEGatherOverhead;
3851 return ST->getGatherOverhead();
3852 break;
3853 case Instruction::Store:
3854 if (SVEScatterOverhead.getNumOccurrences() > 0)
3855 return SVEScatterOverhead;
3856 return ST->getScatterOverhead();
3857 break;
3858 default:
3859 llvm_unreachable("Shouldn't have reached here");
3860 }
3861}
3862
3864 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3865 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3866 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3867 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3868 Alignment, CostKind, I);
3869 auto *VT = cast<VectorType>(DataTy);
3870 auto LT = getTypeLegalizationCost(DataTy);
3871 if (!LT.first.isValid())
3872 return InstructionCost::getInvalid();
3873
3874 // Return an invalid cost for element types that we are unable to lower.
3875 if (!LT.second.isVector() ||
3876 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3877 VT->getElementType()->isIntegerTy(1))
3878 return InstructionCost::getInvalid();
3879
3880 // The code-generator is currently not able to handle scalable vectors
3881 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3882 // it. This change will be removed when code-generation for these types is
3883 // sufficiently reliable.
3884 if (VT->getElementCount() == ElementCount::getScalable(1))
3885 return InstructionCost::getInvalid();
3886
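// The result below is modelled as (number of legal parts) * (scalar memory
// op cost * gather/scatter overhead) * (max elements per legal part). For
// example, for a gather of <vscale x 4 x i32> on a target whose maximum
// vector width is 128 bits, assuming a scalar load cost of 1 and the default
// overhead of 10, this gives roughly 1 * 10 * 4 = 40.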
3887 ElementCount LegalVF = LT.second.getVectorElementCount();
3888 InstructionCost MemOpCost =
3889 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3890 {TTI::OK_AnyValue, TTI::OP_None}, I);
3891 // Add on an overhead cost for using gathers/scatters.
3892 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3893 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3894}
3895
3896bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3897 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3898}
3899
3900 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3901 MaybeAlign Alignment,
3902 unsigned AddressSpace,
3903 TTI::TargetCostKind CostKind,
3904 TTI::OperandValueInfo OpInfo,
3905 const Instruction *I) {
3906 EVT VT = TLI->getValueType(DL, Ty, true);
3907 // Type legalization can't handle structs
3908 if (VT == MVT::Other)
3909 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3910 CostKind);
3911
3912 auto LT = getTypeLegalizationCost(Ty);
3913 if (!LT.first.isValid())
3914 return InstructionCost::getInvalid();
3915
3916 // The code-generator is currently not able to handle scalable vectors
3917 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3918 // it. This change will be removed when code-generation for these types is
3919 // sufficiently reliable.
3920 // We also only support full register predicate loads and stores.
3921 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3922 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3923 (VTy->getElementType()->isIntegerTy(1) &&
3924 !VTy->getElementCount().isKnownMultipleOf(
3925 ElementCount::getScalable(16))))
3926 return InstructionCost::getInvalid();
3927
3928 // TODO: consider latency as well for TCK_SizeAndLatency.
3929 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3930 return LT.first;
3931
3932 if (CostKind != TTI::TCK_RecipThroughput)
3933 return 1;
3934
3935 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3936 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3937 // Unaligned stores are extremely inefficient. We don't split all
3938 // unaligned 128-bit stores because of the negative impact this has shown in
3939 // practice on inlined block copy code.
3940 // We make such stores expensive so that we will only vectorize if there
3941 // are 6 other instructions getting vectorized.
3942 const int AmortizationCost = 6;
3943
3944 return LT.first * 2 * AmortizationCost;
3945 }
3946
3947 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3948 if (Ty->isPtrOrPtrVectorTy())
3949 return LT.first;
3950
3951 if (useNeonVector(Ty)) {
3952 // Check truncating stores and extending loads.
3953 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3954 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3955 if (VT == MVT::v4i8)
3956 return 2;
3957 // Otherwise we need to scalarize.
3958 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3959 }
3960 EVT EltVT = VT.getVectorElementType();
3961 unsigned EltSize = EltVT.getScalarSizeInBits();
3962 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3963 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3964 *Alignment != Align(1))
3965 return LT.first;
3966 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3967 // widening to v4i8, which produces suboptimal results.
3968 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3969 return LT.first;
3970
3971 // Check non-power-of-2 loads/stores for legal vector element types with
3972 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3973 // operations on smaller power-of-2 ops, including ld1/st1.
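// For example, an align-1 store of <7 x i16> is decomposed as 7 -> 4 + 3 and
// then 3 -> 2 + 1, so the worklist below counts three power-of-2 pieces and
// returns a cost of 3.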
3974 LLVMContext &C = Ty->getContext();
3976 SmallVector<EVT> TypeWorklist;
3977 TypeWorklist.push_back(VT);
3978 while (!TypeWorklist.empty()) {
3979 EVT CurrVT = TypeWorklist.pop_back_val();
3980 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3981 if (isPowerOf2_32(CurrNumElements)) {
3982 Cost += 1;
3983 continue;
3984 }
3985
3986 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3987 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3988 TypeWorklist.push_back(
3989 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3990 }
3991 return Cost;
3992 }
3993
3994 return LT.first;
3995}
3996
3998 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3999 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4000 bool UseMaskForCond, bool UseMaskForGaps) {
4001 assert(Factor >= 2 && "Invalid interleave factor");
4002 auto *VecVTy = cast<VectorType>(VecTy);
4003
4004 if (VecTy->isScalableTy() && !ST->hasSVE())
4005 return InstructionCost::getInvalid();
4006
4007 // Vectorization for masked interleaved accesses is only enabled for scalable
4008 // VF.
4009 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4010 return InstructionCost::getInvalid();
4011
4012 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4013 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4014 auto *SubVecTy =
4015 VectorType::get(VecVTy->getElementType(),
4016 VecVTy->getElementCount().divideCoefficientBy(Factor));
4017
4018 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4019 // Accesses having vector types that are a multiple of 128 bits can be
4020 // matched to more than one ldN/stN instruction.
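// For example, an ld2 of <8 x i32> (Factor == 2) uses the legal 128-bit
// sub-vector <4 x i32> and costs 2 * 1 = 2, while an ld2 of <16 x i32> needs
// two ld2 instructions per sub-vector and costs 2 * 2 = 4.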
4021 bool UseScalable;
4022 if (MinElts % Factor == 0 &&
4023 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4024 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4025 }
4026
4027 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4028 Alignment, AddressSpace, CostKind,
4029 UseMaskForCond, UseMaskForGaps);
4030}
4031
4036 for (auto *I : Tys) {
4037 if (!I->isVectorTy())
4038 continue;
4039 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4040 128)
4041 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4042 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4043 }
4044 return Cost;
4045}
4046
4048 return ST->getMaxInterleaveFactor();
4049}
4050
4051// For Falkor, we want to avoid having too many strided loads in a loop since
4052// that can exhaust the HW prefetcher resources. We adjust the unroller
4053// MaxCount preference below to attempt to ensure unrolling doesn't create too
4054// many strided loads.
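// For example, a loop containing two affine strided loads gets
// MaxCount = 1 << Log2_32(7 / 2) = 2, i.e. at most a 2x unroll.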
4055static void
4056 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4057 TargetTransformInfo::UnrollingPreferences &UP) {
4058 enum { MaxStridedLoads = 7 };
4059 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4060 int StridedLoads = 0;
4061 // FIXME? We could make this more precise by looking at the CFG and
4062 // e.g. not counting loads in each side of an if-then-else diamond.
4063 for (const auto BB : L->blocks()) {
4064 for (auto &I : *BB) {
4065 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4066 if (!LMemI)
4067 continue;
4068
4069 Value *PtrValue = LMemI->getPointerOperand();
4070 if (L->isLoopInvariant(PtrValue))
4071 continue;
4072
4073 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4074 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4075 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4076 continue;
4077
4078 // FIXME? We could take pairing of unrolled load copies into account
4079 // by looking at the AddRec, but we would probably have to limit this
4080 // to loops with no stores or other memory optimization barriers.
4081 ++StridedLoads;
4082 // We've seen enough strided loads that seeing more won't make a
4083 // difference.
4084 if (StridedLoads > MaxStridedLoads / 2)
4085 return StridedLoads;
4086 }
4087 }
4088 return StridedLoads;
4089 };
4090
4091 int StridedLoads = countStridedLoads(L, SE);
4092 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4093 << " strided loads\n");
4094 // Pick the largest power of 2 unroll count that won't result in too many
4095 // strided loads.
4096 if (StridedLoads) {
4097 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4098 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4099 << UP.MaxCount << '\n');
4100 }
4101}
4102
4103/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4104/// OOO engine's wide instruction window and various predictors.
4105static void
4106getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4107 TargetTransformInfo::UnrollingPreferences &UP,
4108 AArch64TTIImpl &TTI) {
4109 // Limit loops with structure that is highly likely to benefit from runtime
4110 // unrolling; that is we exclude outer loops, loops with multiple exits and
4111 // many blocks (i.e. likely with complex control flow). Note that the
4112 // heuristics here may be overly conservative and we err on the side of
4113 // avoiding runtime unrolling rather than unroll excessively. They are all
4114 // subject to further refinement.
4115 if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4116 return;
4117
4118 const SCEV *BTC = SE.getBackedgeTakenCount(L);
4119 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4120 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4121 SE.getSmallConstantMaxTripCount(L) <= 32))
4122 return;
4123 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4124 return;
4125
4126 int64_t Size = 0;
4127 for (auto *BB : L->getBlocks()) {
4128 for (auto &I : *BB) {
4129 if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4130 return;
4131 SmallVector<const Value *, 4> Operands(I.operand_values());
4132 Size +=
4134 }
4135 }
4136
4137 // Limit to loops with trip counts that are cheap to expand.
4138 UP.SCEVExpansionBudget = 1;
4139
4140 // Try to unroll small, single block loops, if they have load/store
4141 // dependencies, to expose more parallel memory access streams.
4142 BasicBlock *Header = L->getHeader();
4143 if (Header == L->getLoopLatch()) {
4144 if (Size > 8)
4145 return;
4146
4147 SmallPtrSet<Value *, 8> LoadedValues;
4149 for (auto *BB : L->blocks()) {
4150 for (auto &I : *BB) {
4152 if (!Ptr)
4153 continue;
4154 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4155 if (SE.isLoopInvariant(PtrSCEV, L))
4156 continue;
4157 if (isa<LoadInst>(&I))
4158 LoadedValues.insert(&I);
4159 else
4160 Stores.push_back(cast<StoreInst>(&I));
4161 }
4162 }
4163
4164 // Try to find an unroll count that maximizes the use of the instruction
4165 // window, i.e. trying to fetch as many instructions per cycle as possible.
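// For example, with a body of Size == 12 the candidates are UC = 1..4
// (larger counts exceed the 48-instruction budget); UC == 4 gives 48
// instructions, a multiple of MaxInstsPerLine, so BestUC becomes 4.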
4166 unsigned MaxInstsPerLine = 16;
4167 unsigned UC = 1;
4168 unsigned BestUC = 1;
4169 unsigned SizeWithBestUC = BestUC * Size;
4170 while (UC <= 8) {
4171 unsigned SizeWithUC = UC * Size;
4172 if (SizeWithUC > 48)
4173 break;
4174 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4175 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4176 BestUC = UC;
4177 SizeWithBestUC = BestUC * Size;
4178 }
4179 UC++;
4180 }
4181
4182 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4183 return LoadedValues.contains(SI->getOperand(0));
4184 }))
4185 return;
4186
4187 UP.Runtime = true;
4188 UP.DefaultUnrollRuntimeCount = BestUC;
4189 return;
4190 }
4191
4192 // Try to runtime-unroll loops with early-continues depending on loop-varying
4193 // loads; this helps with branch-prediction for the early-continues.
4194 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4195 auto *Latch = L->getLoopLatch();
4197 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4198 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4199 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
4200 return;
4201
4202 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4203 [&](Instruction *I, unsigned Depth) -> bool {
4204 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4205 return false;
4206
4207 if (isa<LoadInst>(I))
4208 return true;
4209
4210 return any_of(I->operands(), [&](Value *V) {
4211 auto *I = dyn_cast<Instruction>(V);
4212 return I && DependsOnLoopLoad(I, Depth + 1);
4213 });
4214 };
4215 CmpPredicate Pred;
4216 Instruction *I;
4217 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4218 m_Value())) &&
4219 DependsOnLoopLoad(I, 0)) {
4220 UP.Runtime = true;
4221 }
4222}
4223
4224 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4225 TTI::UnrollingPreferences &UP,
4226 OptimizationRemarkEmitter *ORE) {
4227 // Enable partial unrolling and runtime unrolling.
4228 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4229
4230 UP.UpperBound = true;
4231
4232 // An inner loop is more likely to be hot, and the runtime check can be
4233 // hoisted out by LICM, so the overhead is lower; try a larger threshold
4234 // to unroll more loops.
4235 if (L->getLoopDepth() > 1)
4236 UP.PartialThreshold *= 2;
4237
4238 // Disable partial & runtime unrolling on -Os.
4239 UP.PartialOptSizeThreshold = 0;
4240
4241 // Apply subtarget-specific unrolling preferences.
4242 switch (ST->getProcFamily()) {
4243 case AArch64Subtarget::AppleA14:
4244 case AArch64Subtarget::AppleA15:
4245 case AArch64Subtarget::AppleA16:
4246 case AArch64Subtarget::AppleM4:
4247 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4248 break;
4249 case AArch64Subtarget::Falkor:
4250 if (EnableFalkorHWPFUnrollFix)
4251 getFalkorUnrollingPreferences(L, SE, UP);
4252 break;
4253 default:
4254 break;
4255 }
4256
4257 // Scan the loop: don't unroll loops with calls as this could prevent
4258 // inlining. Don't unroll vector loops either, as they don't benefit much from
4259 // unrolling.
4260 for (auto *BB : L->getBlocks()) {
4261 for (auto &I : *BB) {
4262 // Don't unroll vectorized loops.
4263 if (I.getType()->isVectorTy())
4264 return;
4265
4266 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4267 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4268 if (!isLoweredToCall(F))
4269 continue;
4270 }
4271 return;
4272 }
4273 }
4274 }
4275
4276 // Enable runtime unrolling for in-order models
4277 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
4278 // checking for that case, we can ensure that the default behaviour is
4279 // unchanged
4280 if (ST->getProcFamily() != AArch64Subtarget::Others &&
4281 !ST->getSchedModel().isOutOfOrder()) {
4282 UP.Runtime = true;
4283 UP.Partial = true;
4284 UP.UnrollRemainder = true;
4285 UP.DefaultUnrollRuntimeCount = 4;
4286
4287 UP.UnrollAndJam = true;
4288 UP.UnrollAndJamInnerLoopThreshold = 60;
4289 }
4290}
4291
4292void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
4293 TTI::PeelingPreferences &PP) {
4294 BaseT::getPeelingPreferences(L, SE, PP);
4295}
4296
4297Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4298 Type *ExpectedType) {
4299 switch (Inst->getIntrinsicID()) {
4300 default:
4301 return nullptr;
4302 case Intrinsic::aarch64_neon_st2:
4303 case Intrinsic::aarch64_neon_st3:
4304 case Intrinsic::aarch64_neon_st4: {
4305 // Create a struct type
4306 StructType *ST = dyn_cast<StructType>(ExpectedType);
4307 if (!ST)
4308 return nullptr;
4309 unsigned NumElts = Inst->arg_size() - 1;
4310 if (ST->getNumElements() != NumElts)
4311 return nullptr;
4312 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4313 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4314 return nullptr;
4315 }
4316 Value *Res = PoisonValue::get(ExpectedType);
4317 IRBuilder<> Builder(Inst);
4318 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4319 Value *L = Inst->getArgOperand(i);
4320 Res = Builder.CreateInsertValue(Res, L, i);
4321 }
4322 return Res;
4323 }
4324 case Intrinsic::aarch64_neon_ld2:
4325 case Intrinsic::aarch64_neon_ld3:
4326 case Intrinsic::aarch64_neon_ld4:
4327 if (Inst->getType() == ExpectedType)
4328 return Inst;
4329 return nullptr;
4330 }
4331}
4332
4333bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
4334 MemIntrinsicInfo &Info) {
4335 switch (Inst->getIntrinsicID()) {
4336 default:
4337 break;
4338 case Intrinsic::aarch64_neon_ld2:
4339 case Intrinsic::aarch64_neon_ld3:
4340 case Intrinsic::aarch64_neon_ld4:
4341 Info.ReadMem = true;
4342 Info.WriteMem = false;
4343 Info.PtrVal = Inst->getArgOperand(0);
4344 break;
4345 case Intrinsic::aarch64_neon_st2:
4346 case Intrinsic::aarch64_neon_st3:
4347 case Intrinsic::aarch64_neon_st4:
4348 Info.ReadMem = false;
4349 Info.WriteMem = true;
4350 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
4351 break;
4352 }
4353
4354 switch (Inst->getIntrinsicID()) {
4355 default:
4356 return false;
4357 case Intrinsic::aarch64_neon_ld2:
4358 case Intrinsic::aarch64_neon_st2:
4359 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4360 break;
4361 case Intrinsic::aarch64_neon_ld3:
4362 case Intrinsic::aarch64_neon_st3:
4363 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4364 break;
4365 case Intrinsic::aarch64_neon_ld4:
4366 case Intrinsic::aarch64_neon_st4:
4367 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4368 break;
4369 }
4370 return true;
4371}
4372
4373/// See if \p I should be considered for address type promotion. We check if \p
4374/// I is a sext with the right type and used in memory accesses. If it is used in a
4375/// "complex" getelementptr, we allow it to be promoted without finding other
4376/// sext instructions that sign extended the same initial value. A getelementptr
4377/// is considered as "complex" if it has more than 2 operands.
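/// For example, in
///   %idxprom = sext i32 %i to i64
///   %arrayidx = getelementptr inbounds [16 x [16 x i32]], ptr %A, i64 0, i64 %idxprom, i64 5
/// the getelementptr has more than 2 operands, so the sext is considered for
/// promotion even without a common header (the names above are illustrative).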
4378bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
4379 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4380 bool Considerable = false;
4381 AllowPromotionWithoutCommonHeader = false;
4382 if (!isa<SExtInst>(&I))
4383 return false;
4384 Type *ConsideredSExtType =
4385 Type::getInt64Ty(I.getParent()->getParent()->getContext());
4386 if (I.getType() != ConsideredSExtType)
4387 return false;
4388 // See if the sext is the one with the right type and used in at least one
4389 // GetElementPtrInst.
4390 for (const User *U : I.users()) {
4391 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4392 Considerable = true;
4393 // A getelementptr is considered as "complex" if it has more than 2
4394 // operands. We will promote a SExt used in such a complex GEP, as we
4395 // expect some computation to be merged if it is done on 64 bits.
4396 if (GEPInst->getNumOperands() > 2) {
4397 AllowPromotionWithoutCommonHeader = true;
4398 break;
4399 }
4400 }
4401 }
4402 return Considerable;
4403}
4404
4405bool AArch64TTIImpl::isLegalToVectorizeReduction(
4406 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4407 if (!VF.isScalable())
4408 return true;
4409
4410 Type *Ty = RdxDesc.getRecurrenceType();
4411 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
4412 return false;
4413
4414 switch (RdxDesc.getRecurrenceKind()) {
4415 case RecurKind::Add:
4416 case RecurKind::FAdd:
4417 case RecurKind::And:
4418 case RecurKind::Or:
4419 case RecurKind::Xor:
4420 case RecurKind::SMin:
4421 case RecurKind::SMax:
4422 case RecurKind::UMin:
4423 case RecurKind::UMax:
4424 case RecurKind::FMin:
4425 case RecurKind::FMax:
4426 case RecurKind::FMulAdd:
4427 case RecurKind::IAnyOf:
4428 case RecurKind::FAnyOf:
4429 return true;
4430 default:
4431 return false;
4432 }
4433}
4434
4437 FastMathFlags FMF,
4439 // The code-generator is currently not able to handle scalable vectors
4440 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4441 // it. This change will be removed when code-generation for these types is
4442 // sufficiently reliable.
4443 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4444 if (VTy->getElementCount() == ElementCount::getScalable(1))
4445 return InstructionCost::getInvalid();
4446
4447 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4448
4449 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
4450 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
4451
4452 InstructionCost LegalizationCost = 0;
4453 if (LT.first > 1) {
4454 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
4455 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
4456 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
4457 }
4458
4459 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
4460}
4461
4462InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
4463 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
4464 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4465 InstructionCost LegalizationCost = 0;
4466 if (LT.first > 1) {
4467 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
4468 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
4469 LegalizationCost *= LT.first - 1;
4470 }
4471
4472 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4473 assert(ISD && "Invalid opcode");
4474 // Add the final reduction cost for the legal horizontal reduction
4475 switch (ISD) {
4476 case ISD::ADD:
4477 case ISD::AND:
4478 case ISD::OR:
4479 case ISD::XOR:
4480 case ISD::FADD:
4481 return LegalizationCost + 2;
4482 default:
4484 }
4485}
4486
4489 std::optional<FastMathFlags> FMF,
4491 // The code-generator is currently not able to handle scalable vectors
4492 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4493 // it. This change will be removed when code-generation for these types is
4494 // sufficiently reliable.
4495 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
4496 if (VTy->getElementCount() == ElementCount::getScalable(1))
4497 return InstructionCost::getInvalid();
4498
4499 if (TTI::requiresOrderedReduction(FMF)) {
4500 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
4501 InstructionCost BaseCost =
4502 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4503 // Add on extra cost to reflect the extra overhead on some CPUs. We still
4504 // end up vectorizing for more computationally intensive loops.
4505 return BaseCost + FixedVTy->getNumElements();
4506 }
4507
4508 if (Opcode != Instruction::FAdd)
4509 return InstructionCost::getInvalid();
4510
4511 auto *VTy = cast<ScalableVectorType>(ValTy);
4512 InstructionCost Cost =
4513 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
4514 Cost *= getMaxNumElements(VTy->getElementCount());
4515 return Cost;
4516 }
4517
4518 if (isa<ScalableVectorType>(ValTy))
4519 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
4520
4521 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4522 MVT MTy = LT.second;
4523 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4524 assert(ISD && "Invalid opcode");
4525
4526 // Horizontal adds can use the 'addv' instruction. We model the cost of these
4527 // instructions as twice a normal vector add, plus 1 for each legalization
4528 // step (LT.first). This is the only arithmetic vector reduction operation for
4529 // which we have an instruction.
4530 // OR, XOR and AND costs should match the codegen from:
4531 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
4532 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4533 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4534 static const CostTblEntry CostTblNoPairwise[]{
4535 {ISD::ADD, MVT::v8i8, 2},
4536 {ISD::ADD, MVT::v16i8, 2},
4537 {ISD::ADD, MVT::v4i16, 2},
4538 {ISD::ADD, MVT::v8i16, 2},
4539 {ISD::ADD, MVT::v4i32, 2},
4540 {ISD::ADD, MVT::v2i64, 2},
4541 {ISD::OR, MVT::v8i8, 15},
4542 {ISD::OR, MVT::v16i8, 17},
4543 {ISD::OR, MVT::v4i16, 7},
4544 {ISD::OR, MVT::v8i16, 9},
4545 {ISD::OR, MVT::v2i32, 3},
4546 {ISD::OR, MVT::v4i32, 5},
4547 {ISD::OR, MVT::v2i64, 3},
4548 {ISD::XOR, MVT::v8i8, 15},
4549 {ISD::XOR, MVT::v16i8, 17},
4550 {ISD::XOR, MVT::v4i16, 7},
4551 {ISD::XOR, MVT::v8i16, 9},
4552 {ISD::XOR, MVT::v2i32, 3},
4553 {ISD::XOR, MVT::v4i32, 5},
4554 {ISD::XOR, MVT::v2i64, 3},
4555 {ISD::AND, MVT::v8i8, 15},
4556 {ISD::AND, MVT::v16i8, 17},
4557 {ISD::AND, MVT::v4i16, 7},
4558 {ISD::AND, MVT::v8i16, 9},
4559 {ISD::AND, MVT::v2i32, 3},
4560 {ISD::AND, MVT::v4i32, 5},
4561 {ISD::AND, MVT::v2i64, 3},
4562 };
4563 switch (ISD) {
4564 default:
4565 break;
4566 case ISD::FADD:
4567 if (Type *EltTy = ValTy->getScalarType();
4568 // FIXME: For half types without fullfp16 support, this could extend and
4569 // use a fp32 faddp reduction but current codegen unrolls.
4570 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
4571 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
4572 const unsigned NElts = MTy.getVectorNumElements();
4573 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
4574 isPowerOf2_32(NElts))
4575 // A reduction corresponding to a series of fadd instructions is lowered
4576 // to a series of faddp instructions. faddp has latency/throughput that
4577 // matches fadd, and hence every faddp instruction can be considered to
4578 // have a relative cost of 1 with
4579 // CostKind = TCK_RecipThroughput.
4580 // An faddp will pairwise add vector elements, so the size of the input
4581 // vector reduces by half every time, requiring
4582 // #(faddp instructions) = log2_32(NElts).
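// Illustrative example: a v4f32 fadd reduction needs log2(4) = 2 faddp
// steps, so with LT.first == 1 the cost is 2; a v8f32 reduction first
// splits into two v4f32 halves (LT.first == 2), giving (2 - 1) + 2 = 3.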
4583 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
4584 }
4585 break;
4586 case ISD::ADD:
4587 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4588 return (LT.first - 1) + Entry->Cost;
4589 break;
4590 case ISD::XOR:
4591 case ISD::AND:
4592 case ISD::OR:
4593 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4594 if (!Entry)
4595 break;
4596 auto *ValVTy = cast<FixedVectorType>(ValTy);
4597 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4598 isPowerOf2_32(ValVTy->getNumElements())) {
4599 InstructionCost ExtraCost = 0;
4600 if (LT.first != 1) {
4601 // Type needs to be split, so there is an extra cost of LT.first - 1
4602 // arithmetic ops.
4603 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4604 MTy.getVectorNumElements());
4605 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4606 ExtraCost *= LT.first - 1;
4607 }
4608 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4609 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4610 return Cost + ExtraCost;
4611 }
4612 break;
4613 }
4614 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4615}
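// Worked example from the table above: a v16i8 add reduction maps to
// {ISD::ADD, MVT::v16i8, 2} with LT.first == 1, so it costs 2 (roughly an
// addv plus a transfer to a general-purpose register). A v32i8 add
// reduction is first split into two v16i8 halves (LT.first == 2), costing
// (2 - 1) + 2 = 3.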
4616
4617InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
4618 static const CostTblEntry ShuffleTbl[] = {
4619 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4620 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4621 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4622 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4623 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4624 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4625 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4626 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4627 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4628 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4629 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4630 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4631 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4632 };
4633
4634 // The code-generator is currently not able to handle scalable vectors
4635 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4636 // it. This change will be removed when code-generation for these types is
4637 // sufficiently reliable.
4638 if (Tp->getElementCount() == ElementCount::getScalable(1))
4639 return InstructionCost::getInvalid();
4640
4641 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4642 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4643 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4644 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4645 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4646 : LT.second;
4647 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4648 InstructionCost LegalizationCost = 0;
4649 if (Index < 0) {
4650 LegalizationCost =
4651 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4652 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4653 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4654 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4655 }
4656
4657 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
4658 // Cost performed on a promoted type.
4659 if (LT.second.getScalarType() == MVT::i1) {
4660 LegalizationCost +=
4661 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4662 TTI::CastContextHint::None, CostKind) +
4663 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4664 TTI::CastContextHint::None, CostKind);
4665 }
4666 const auto *Entry =
4667 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4668 assert(Entry && "Illegal Type for Splice");
4669 LegalizationCost += Entry->Cost;
4670 return LegalizationCost * LT.first;
4671}
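// Illustrative example (sketch): splicing two nxv4i32 vectors legalizes to a
// single splice-style operation, so the table gives a cost of 1. For a
// predicate type such as nxv4i1 the operands are promoted first, so zext and
// trunc casts (and, for a negative Index, a compare plus select) are added
// on top of the table entry for the promoted type.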
4672
4673InstructionCost AArch64TTIImpl::getPartialReductionCost(
4674 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
4675 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
4676 TTI::PartialReductionExtendKind OpBExtend,
4677 std::optional<unsigned> BinOp) const {
4678 InstructionCost Invalid = InstructionCost::getInvalid();
4679 InstructionCost Cost(TTI::TCC_Basic);
4680
4681 if (Opcode != Instruction::Add)
4682 return Invalid;
4683
4684 if (InputTypeA != InputTypeB)
4685 return Invalid;
4686
4687 EVT InputEVT = EVT::getEVT(InputTypeA);
4688 EVT AccumEVT = EVT::getEVT(AccumType);
4689
4690 if (VF.isScalable() && !ST->isSVEorStreamingSVEAvailable())
4691 return Invalid;
4692 if (VF.isFixed() && (!ST->isNeonAvailable() || !ST->hasDotProd()))
4693 return Invalid;
4694
4695 if (InputEVT == MVT::i8) {
4696 switch (VF.getKnownMinValue()) {
4697 default:
4698 return Invalid;
4699 case 8:
4700 if (AccumEVT == MVT::i32)
4701 Cost *= 2;
4702 else if (AccumEVT != MVT::i64)
4703 return Invalid;
4704 break;
4705 case 16:
4706 if (AccumEVT == MVT::i64)
4707 Cost *= 2;
4708 else if (AccumEVT != MVT::i32)
4709 return Invalid;
4710 break;
4711 }
4712 } else if (InputEVT == MVT::i16) {
4713 // FIXME: Allow i32 accumulator but increase cost, as we would extend
4714 // it to i64.
4715 if (VF.getKnownMinValue() != 8 || AccumEVT != MVT::i64)
4716 return Invalid;
4717 } else
4718 return Invalid;
4719
4720 // AArch64 supports lowering mixed extensions to a usdot but only if the
4721 // i8mm or sve/streaming features are available.
4722 if (OpAExtend == TTI::PR_None || OpBExtend == TTI::PR_None ||
4723 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
4724 !ST->isSVEorStreamingSVEAvailable()))
4725 return Invalid;
4726
4727 if (!BinOp || *BinOp != Instruction::Mul)
4728 return Invalid;
4729
4730 return Cost;
4731}
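// Illustrative example (sketch, assuming the base cost above is the usual
// one-instruction TCC_Basic): a partial reduction of 16 x i8 inputs,
// extended and multiplied into an i32 accumulator, keeps the base cost (it
// maps well onto a dot-product style lowering), while accumulating the same
// i8 inputs into i64 doubles the cost, and unsupported shapes return an
// invalid cost.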
4732
4733InstructionCost AArch64TTIImpl::getShuffleCost(
4734 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
4735 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
4736 ArrayRef<const Value *> Args, const Instruction *CxtI) {
4737 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4738
4739 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4740 // into smaller vectors and sum the cost of each shuffle.
4741 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4742 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4743 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4744
4745 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4746 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4747 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4748 // cost than just the load.
4749 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4750 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
4751 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
4752 return std::max<InstructionCost>(1, LT.first / 4);
4753
4754 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4755 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4756 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4757 // cost than just the store.
4758 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4759 (ShuffleVectorInst::isInterleaveMask(
4760 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4761 ShuffleVectorInst::isInterleaveMask(
4762 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4763 return LT.first;
4764
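// Illustrative example (sketch): a factor-4 deinterleaving mask such as
// <0, 4, 8, 12> applied to a loaded <16 x i8> models an LD4-style access,
// and a store fed by an interleaving shuffle models ST3/ST4; both are
// costed in terms of LT.first rather than as a full generic shuffle.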
4765 unsigned TpNumElts = Mask.size();
4766 unsigned LTNumElts = LT.second.getVectorNumElements();
4767 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4768 VectorType *NTp =
4769 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4770 InstructionCost Cost;
4771 for (unsigned N = 0; N < NumVecs; N++) {
4772 SmallVector<int> NMask;
4773 // Split the existing mask into chunks of size LTNumElts. Track the source
4774 // sub-vectors to ensure the result has at most 2 inputs.
4775 unsigned Source1, Source2;
4776 unsigned NumSources = 0;
4777 for (unsigned E = 0; E < LTNumElts; E++) {
4778 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4779 : PoisonMaskElem;
4780 if (MaskElt < 0) {
4781 NMask.push_back(PoisonMaskElem);
4782 continue;
4783 }
4784
4785 // Calculate which source from the input this comes from and whether it
4786 // is new to us.
4787 unsigned Source = MaskElt / LTNumElts;
4788 if (NumSources == 0) {
4789 Source1 = Source;
4790 NumSources = 1;
4791 } else if (NumSources == 1 && Source != Source1) {
4792 Source2 = Source;
4793 NumSources = 2;
4794 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4795 NumSources++;
4796 }
4797
4798 // Add to the new mask. For the NumSources>2 case these are not correct,
4799 // but are only used for the modular lane number.
4800 if (Source == Source1)
4801 NMask.push_back(MaskElt % LTNumElts);
4802 else if (Source == Source2)
4803 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4804 else
4805 NMask.push_back(MaskElt % LTNumElts);
4806 }
4807 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4808 // getShuffleCost. If not then cost it using the worst case as the number
4809 // of element moves into a new vector.
4810 if (NumSources <= 2)
4811 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4812 : TTI::SK_PermuteTwoSrc,
4813 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4814 else
4815 Cost += LTNumElts;
4816 }
4817 return Cost;
4818 }
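// Illustrative example (sketch): a 16-element mask on a type that legalizes
// to v8i16 is processed as NumVecs == 2 chunks of LTNumElts == 8. A chunk
// whose elements all come from one or two of the legalized sub-vectors is
// re-costed as a single- or two-source permute; a chunk drawing from three
// or more sub-vectors is costed as 8, i.e. one element move per lane.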
4819
4820 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4821 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4822 // A subvector extract can be implemented with an ext (or trivial extract, if
4823 // from lane 0). This currently only handles low or high extracts to prevent
4824 // SLP vectorizer regressions.
4825 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
4826 if (LT.second.is128BitVector() &&
4827 cast<FixedVectorType>(SubTp)->getNumElements() ==
4828 LT.second.getVectorNumElements() / 2) {
4829 if (Index == 0)
4830 return 0;
4831 if (Index == (int)LT.second.getVectorNumElements() / 2)
4832 return 1;
4833 }
4834 Kind = TTI::SK_PermuteSingleSrc;
4835 }
4836
4837 // Check for broadcast loads, which are supported by the LD1R instruction.
4838 // In terms of code-size, the shuffle vector is free when a load + dup get
4839 // folded into a LD1R. That's what we check and return here. For performance
4840 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4841 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4842 // that we model the load + dup sequence slightly higher because LD1R is a
4843 // high latency instruction.
4844 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4845 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4846 if (IsLoad && LT.second.isVector() &&
4847 isLegalBroadcastLoad(Tp->getElementType(),
4848 LT.second.getVectorElementCount()))
4849 return 0;
4850 }
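// Illustrative example (sketch): a load feeding a splat shuffle (load + dup)
// folds into a single LD1R, so for TCK_CodeSize the broadcast is free (0);
// for throughput cost kinds it falls through to the normal broadcast cost
// below, modelling LD1R's higher latency.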
4851
4852 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4853 // from the perfect shuffle tables.
4854 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4855 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4856 all_of(Mask, [](int E) { return E < 8; }))
4857 return getPerfectShuffleCost(Mask);
4858
4859 // Check for identity masks, which we can treat as free.
4860 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4861 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4862 all_of(enumerate(Mask), [](const auto &M) {
4863 return M.value() < 0 || M.value() == (int)M.index();
4864 }))
4865 return 0;
4866
4867 // Check for other shuffles that are not SK_ kinds but we have native
4868 // instructions for, for example ZIP and UZP.
4869 unsigned Unused;
4870 if (LT.second.isFixedLengthVector() &&
4871 LT.second.getVectorNumElements() == Mask.size() &&
4872 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4873 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4874 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4875 // Check for non-zero lane splats
4876 all_of(drop_begin(Mask),
4877 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4878 return 1;
4879
4880 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4881 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4882 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4883 static const CostTblEntry ShuffleTbl[] = {
4884 // Broadcast shuffle kinds can be performed with 'dup'.
4885 {TTI::SK_Broadcast, MVT::v8i8, 1},
4886 {TTI::SK_Broadcast, MVT::v16i8, 1},
4887 {TTI::SK_Broadcast, MVT::v4i16, 1},
4888 {TTI::SK_Broadcast, MVT::v8i16, 1},
4889 {TTI::SK_Broadcast, MVT::v2i32, 1},
4890 {TTI::SK_Broadcast, MVT::v4i32, 1},
4891 {TTI::SK_Broadcast, MVT::v2i64, 1},
4892 {TTI::SK_Broadcast, MVT::v4f16, 1},
4893 {TTI::SK_Broadcast, MVT::v8f16, 1},
4894 {TTI::SK_Broadcast, MVT::v2f32, 1},
4895 {TTI::SK_Broadcast, MVT::v4f32, 1},
4896 {TTI::SK_Broadcast, MVT::v2f64, 1},
4897 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4898 // 'zip1/zip2' instructions.
4899 {TTI::SK_Transpose, MVT::v8i8, 1},
4900 {TTI::SK_Transpose, MVT::v16i8, 1},
4901 {TTI::SK_Transpose, MVT::v4i16, 1},
4902 {TTI::SK_Transpose, MVT::v8i16, 1},
4903 {TTI::SK_Transpose, MVT::v2i32, 1},
4904 {TTI::SK_Transpose, MVT::v4i32, 1},
4905 {TTI::SK_Transpose, MVT::v2i64, 1},
4906 {TTI::SK_Transpose, MVT::v4f16, 1},
4907 {TTI::SK_Transpose, MVT::v8f16, 1},
4908 {TTI::SK_Transpose, MVT::v2f32, 1},
4909 {TTI::SK_Transpose, MVT::v4f32, 1},
4910 {TTI::SK_Transpose, MVT::v2f64, 1},
4911 // Select shuffle kinds.
4912 // TODO: handle vXi8/vXi16.
4913 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4914 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4915 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4916 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4917 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4918 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4919 // PermuteSingleSrc shuffle kinds.
4920 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4921 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4922 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4923 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4924 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4925 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4926 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4927 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4928 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4929 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4930 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4931 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4932 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4933 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4934 // Reverse can be lowered with `rev`.
4935 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4936 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4937 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4938 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4939 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4940 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4941 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4942 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4943 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4944 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4945 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4946 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4947 // Splice can all be lowered as `ext`.
4948 {TTI::SK_Splice, MVT::v2i32, 1},
4949 {TTI::SK_Splice, MVT::v4i32, 1},
4950 {TTI::SK_Splice, MVT::v2i64, 1},
4951 {TTI::SK_Splice, MVT::v2f32, 1},
4952 {TTI::SK_Splice, MVT::v4f32, 1},
4953 {TTI::SK_Splice, MVT::v2f64, 1},
4954 {TTI::SK_Splice, MVT::v8f16, 1},
4955 {TTI::SK_Splice, MVT::v8bf16, 1},
4956 {TTI::SK_Splice, MVT::v8i16, 1},
4957 {TTI::SK_Splice, MVT::v16i8, 1},
4958 {TTI::SK_Splice, MVT::v4bf16, 1},
4959 {TTI::SK_Splice, MVT::v4f16, 1},
4960 {TTI::SK_Splice, MVT::v4i16, 1},
4961 {TTI::SK_Splice, MVT::v8i8, 1},
4962 // Broadcast shuffle kinds for scalable vectors
4963 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4964 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4965 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4966 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4967 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4968 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4969 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4970 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4971 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4972 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4973 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4974 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4975 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4976 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4977 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4978 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4979 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4980 // Handle the cases for vector.reverse with scalable vectors
4981 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4982 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4983 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4984 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4985 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4986 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4987 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4988 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4989 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4990 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4991 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4992 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4993 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4994 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4995 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4996 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4997 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4998 };
4999 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
5000 return LT.first * Entry->Cost;
5001 }
5002
5003 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
5004 return getSpliceCost(Tp, Index);
5005
5006 // Inserting a subvector can often be done with either a D, S or H register
5007 // move, so long as the inserted vector is "aligned".
5008 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5009 LT.second.getSizeInBits() <= 128 && SubTp) {
5010 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
5011 if (SubLT.second.isVector()) {
5012 int NumElts = LT.second.getVectorNumElements();
5013 int NumSubElts = SubLT.second.getVectorNumElements();
5014 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5015 return SubLT.first;
5016 }
5017 }
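// Illustrative example: inserting a v2f32 subvector into a v4f32 vector at
// index 0 or 2 is an "aligned" insert and is modelled as a single D/S
// register move, i.e. cost SubLT.first (typically 1).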
5018
5019 // Restore optimal kind.
5020 if (IsExtractSubvector)
5021 Kind = TTI::SK_ExtractSubvector;
5022 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
5023 CxtI);
5024}
5025
5026static bool containsDecreasingPointers(Loop *TheLoop,
5027 PredicatedScalarEvolution *PSE) {
5028 const auto &Strides = DenseMap<Value *, const SCEV *>();
5029 for (BasicBlock *BB : TheLoop->blocks()) {
5030 // Scan the instructions in the block and look for addresses that are
5031 // consecutive and decreasing.
5032 for (Instruction &I : *BB) {
5033 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
5034 Value *Ptr = getLoadStorePointerOperand(&I);
5035 Type *AccessTy = getLoadStoreType(&I);
5036 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
5037 /*ShouldCheckWrap=*/false)
5038 .value_or(0) < 0)
5039 return true;
5040 }
5041 }
5042 }
5043 return false;
5044}
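// Illustrative example (sketch): a loop that walks an array backwards, e.g.
//   for (int i = n - 1; i >= 0; --i)
//     sum += a[i];
// has a load whose pointer stride is negative, so this returns true and the
// tail-folding decision below additionally requires the Reverse option.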
5045
5046bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5047 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5048 return SVEPreferFixedOverScalableIfEqualCost;
5049 return ST->useFixedOverScalableIfEqualCost();
5050}
5051
5052unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5053 return ST->getEpilogueVectorizationMinVF();
5054}
5055
5056bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
5057 if (!ST->hasSVE())
5058 return false;
5059
5060 // We don't currently support vectorisation with interleaving for SVE - with
5061 // such loops we're better off not using tail-folding. This gives us a chance
5062 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
5063 if (TFI->IAI->hasGroups())
5064 return false;
5065
5066 TailFoldingOpts Required = TailFoldingOpts::Disabled;
5067 if (TFI->LVL->getReductionVars().size())
5068 Required |= TailFoldingOpts::Reductions;
5069 if (TFI->LVL->getFixedOrderRecurrences().size())
5070 Required |= TailFoldingOpts::Recurrences;
5071
5072 // We call this to discover whether any load/store pointers in the loop have
5073 // negative strides. This will require extra work to reverse the loop
5074 // predicate, which may be expensive.
5075 if (containsDecreasingPointers(TFI->LVL->getLoop(),
5076 TFI->LVL->getPredicatedScalarEvolution()))
5077 Required |= TailFoldingOpts::Reverse;
5078 if (Required == TailFoldingOpts::Disabled)
5079 Required |= TailFoldingOpts::Simple;
5080
5081 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
5082 Required))
5083 return false;
5084
5085 // Don't tail-fold for tight loops where we would be better off interleaving
5086 // with an unpredicated loop.
5087 unsigned NumInsns = 0;
5088 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5089 NumInsns += BB->sizeWithoutDebug();
5090 }
5091
5092 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
5093 return NumInsns >= SVETailFoldInsnThreshold;
5094}
5095
5096InstructionCost
5097AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
5098 StackOffset BaseOffset, bool HasBaseReg,
5099 int64_t Scale, unsigned AddrSpace) const {
5100 // Scaling factors are not free at all.
5101 // Operands | Rt Latency
5102 // -------------------------------------------
5103 // Rt, [Xn, Xm] | 4
5104 // -------------------------------------------
5105 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
5106 // Rt, [Xn, Wm, <extend> #imm] |
5107 TargetLoweringBase::AddrMode AM;
5108 AM.BaseGV = BaseGV;
5109 AM.BaseOffs = BaseOffset.getFixed();
5110 AM.HasBaseReg = HasBaseReg;
5111 AM.Scale = Scale;
5112 AM.ScalableOffset = BaseOffset.getScalable();
5113 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
5114 // Scale represents reg2 * scale, thus account for 1 if
5115 // it is not equal to 0 or 1.
5116 return AM.Scale != 0 && AM.Scale != 1;
5117 return -1;
5118}
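// Illustrative example: an access of the form [Xn, Xm, lsl #2] corresponds
// to AM.Scale == 4, which is legal but not free, so the cost is 1; a plain
// base register or base + immediate (Scale 0 or 1) costs 0, and an
// addressing mode the target cannot fold at all yields -1.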
5119
5120bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
5121 if (EnableOrLikeSelectOpt) {
5122 // For the binary operators (e.g. or) we need to be more careful than
5123 // selects, here we only transform them if they are already at a natural
5124 // break point in the code - the end of a block with an unconditional
5125 // terminator.
5126 if (I->getOpcode() == Instruction::Or &&
5127 isa<BranchInst>(I->getNextNode()) &&
5128 cast<BranchInst>(I->getNextNode())->isUnconditional())
5129 return true;
5130
5131 if (I->getOpcode() == Instruction::Add ||
5132 I->getOpcode() == Instruction::Sub)
5133 return true;
5134 }
5135 return BaseT::shouldTreatInstructionLikeSelect(I);
5136}
5137
5138bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
5139 const TargetTransformInfo::LSRCost &C2) {
5140 // AArch64 specific here is adding the number of instructions to the
5141 // comparison (though not as the first consideration, as some targets do)
5142 // along with changing the priority of the base additions.
5143 // TODO: Maybe a more nuanced tradeoff between instruction count
5144 // and number of registers? To be investigated at a later date.
5145 if (EnableLSRCostOpt)
5146 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5147 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5148 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5149 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5150
5151 return BaseT::isLSRCostLess(C1, C2);
5152}
5153
5154static bool isSplatShuffle(Value *V) {
5155 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5156 return all_equal(Shuf->getShuffleMask());
5157 return false;
5158}
5159
5160/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5161/// or upper half of the vector elements.
5162static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5163 bool AllowSplat = false) {
5164 // Scalable types can't be extract shuffle vectors.
5165 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5166 return false;
5167
5168 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5169 auto *FullTy = FullV->getType();
5170 auto *HalfTy = HalfV->getType();
5171 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5172 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5173 };
5174
5175 auto extractHalf = [](Value *FullV, Value *HalfV) {
5176 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5177 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5178 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5179 };
5180
5181 ArrayRef<int> M1, M2;
5182 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5183 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5184 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5185 return false;
5186
5187 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5188 // it is not checked as an extract below.
5189 if (AllowSplat && isSplatShuffle(Op1))
5190 S1Op1 = nullptr;
5191 if (AllowSplat && isSplatShuffle(Op2))
5192 S2Op1 = nullptr;
5193
5194 // Check that the operands are half as wide as the result and we extract
5195 // half of the elements of the input vectors.
5196 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5197 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5198 return false;
5199
5200 // Check the mask extracts either the lower or upper half of vector
5201 // elements.
5202 int M1Start = 0;
5203 int M2Start = 0;
5204 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5205 if ((S1Op1 &&
5206 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5207 (S2Op1 &&
5208 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5209 return false;
5210
5211 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5212 (M2Start != 0 && M2Start != (NumElements / 2)))
5213 return false;
5214 if (S1Op1 && S2Op1 && M1Start != M2Start)
5215 return false;
5216
5217 return true;
5218}
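// Illustrative example (sketch): with <8 x i16> inputs, two shuffles using
// masks <0, 1, 2, 3> (both lower halves) or <4, 5, 6, 7> (both upper halves)
// satisfy this check, while one lower-half and one upper-half extract do not,
// because their start indices differ.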
5219
5220/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5221/// of the vector elements.
5222static bool areExtractExts(Value *Ext1, Value *Ext2) {
5223 auto areExtDoubled = [](Instruction *Ext) {
5224 return Ext->getType()->getScalarSizeInBits() ==
5225 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5226 };
5227
5228 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5229 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5230 !areExtDoubled(cast<Instruction>(Ext1)) ||
5231 !areExtDoubled(cast<Instruction>(Ext2)))
5232 return false;
5233
5234 return true;
5235}
5236
5237/// Check if Op could be used with vmull_high_p64 intrinsic.
5238static bool isOperandOfVmullHighP64(Value *Op) {
5239 Value *VectorOperand = nullptr;
5240 ConstantInt *ElementIndex = nullptr;
5241 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5242 m_ConstantInt(ElementIndex))) &&
5243 ElementIndex->getValue() == 1 &&
5244 isa<FixedVectorType>(VectorOperand->getType()) &&
5245 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5246}
5247
5248/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5249static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5250 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
5251}
5252
5253static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
5254 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5255 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5256 if (!GEP || GEP->getNumOperands() != 2)
5257 return false;
5258
5259 Value *Base = GEP->getOperand(0);
5260 Value *Offsets = GEP->getOperand(1);
5261
5262 // We only care about scalar_base+vector_offsets.
5263 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5264 return false;
5265
5266 // Sink extends that would allow us to use 32-bit offset vectors.
5267 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5268 auto *OffsetsInst = cast<Instruction>(Offsets);
5269 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5270 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5271 Ops.push_back(&GEP->getOperandUse(1));
5272 }
5273
5274 // Sink the GEP.
5275 return true;
5276}
5277
5278/// We want to sink following cases:
5279/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5280/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
5281static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
5282 if (match(Op, m_VScale()))
5283 return true;
5284 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5285 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
5286 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5287 return true;
5288 }
5289 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5290 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
5291 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5292 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5293 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5294 return true;
5295 }
5296 return false;
5297}
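// Illustrative IR example (sketch):
//   %vs  = call i64 @llvm.vscale.i64()
//   %off = shl i64 %vs, 4
//   %gep = getelementptr i8, ptr %base, i64 %off
// Here the vscale (and any zext of it) is recorded in Ops so CodeGenPrepare
// can sink it next to the add/sub/gep user, helping instruction selection
// fold the scaled offset.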
5298
5299/// Check if sinking \p I's operands to I's basic block is profitable, because
5300/// the operands can be folded into a target instruction, e.g.
5301/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
5302bool AArch64TTIImpl::isProfitableToSinkOperands(
5303 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5304 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5305 switch (II->getIntrinsicID()) {
5306 case Intrinsic::aarch64_neon_smull:
5307 case Intrinsic::aarch64_neon_umull:
5308 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
5309 /*AllowSplat=*/true)) {
5310 Ops.push_back(&II->getOperandUse(0));
5311 Ops.push_back(&II->getOperandUse(1));
5312 return true;
5313 }
5314 [[fallthrough]];
5315
5316 case Intrinsic::fma:
5317 case Intrinsic::fmuladd:
5318 if (isa<VectorType>(I->getType()) &&
5319 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5320 !ST->hasFullFP16())
5321 return false;
5322 [[fallthrough]];
5323 case Intrinsic::aarch64_neon_sqdmull:
5324 case Intrinsic::aarch64_neon_sqdmulh:
5325 case Intrinsic::aarch64_neon_sqrdmulh:
5326 // Sink splats for index lane variants
5327 if (isSplatShuffle(II->getOperand(0)))
5328 Ops.push_back(&II->getOperandUse(0));
5329 if (isSplatShuffle(II->getOperand(1)))
5330 Ops.push_back(&II->getOperandUse(1));
5331 return !Ops.empty();
5332 case Intrinsic::aarch64_neon_fmlal:
5333 case Intrinsic::aarch64_neon_fmlal2:
5334 case Intrinsic::aarch64_neon_fmlsl:
5335 case Intrinsic::aarch64_neon_fmlsl2:
5336 // Sink splats for index lane variants
5337 if (isSplatShuffle(II->getOperand(1)))
5338 Ops.push_back(&II->getOperandUse(1));
5339 if (isSplatShuffle(II->getOperand(2)))
5340 Ops.push_back(&II->getOperandUse(2));
5341 return !Ops.empty();
5342 case Intrinsic::aarch64_sve_ptest_first:
5343 case Intrinsic::aarch64_sve_ptest_last:
5344 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
5345 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
5346 Ops.push_back(&II->getOperandUse(0));
5347 return !Ops.empty();
5348 case Intrinsic::aarch64_sme_write_horiz:
5349 case Intrinsic::aarch64_sme_write_vert:
5350 case Intrinsic::aarch64_sme_writeq_horiz:
5351 case Intrinsic::aarch64_sme_writeq_vert: {
5352 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
5353 if (!Idx || Idx->getOpcode() != Instruction::Add)
5354 return false;
5355 Ops.push_back(&II->getOperandUse(1));
5356 return true;
5357 }
5358 case Intrinsic::aarch64_sme_read_horiz:
5359 case Intrinsic::aarch64_sme_read_vert:
5360 case Intrinsic::aarch64_sme_readq_horiz:
5361 case Intrinsic::aarch64_sme_readq_vert:
5362 case Intrinsic::aarch64_sme_ld1b_vert:
5363 case Intrinsic::aarch64_sme_ld1h_vert:
5364 case Intrinsic::aarch64_sme_ld1w_vert:
5365 case Intrinsic::aarch64_sme_ld1d_vert:
5366 case Intrinsic::aarch64_sme_ld1q_vert:
5367 case Intrinsic::aarch64_sme_st1b_vert:
5368 case Intrinsic::aarch64_sme_st1h_vert:
5369 case Intrinsic::aarch64_sme_st1w_vert:
5370 case Intrinsic::aarch64_sme_st1d_vert:
5371 case Intrinsic::aarch64_sme_st1q_vert:
5372 case Intrinsic::aarch64_sme_ld1b_horiz:
5373 case Intrinsic::aarch64_sme_ld1h_horiz:
5374 case Intrinsic::aarch64_sme_ld1w_horiz:
5375 case Intrinsic::aarch64_sme_ld1d_horiz:
5376 case Intrinsic::aarch64_sme_ld1q_horiz:
5377 case Intrinsic::aarch64_sme_st1b_horiz:
5378 case Intrinsic::aarch64_sme_st1h_horiz:
5379 case Intrinsic::aarch64_sme_st1w_horiz:
5380 case Intrinsic::aarch64_sme_st1d_horiz:
5381 case Intrinsic::aarch64_sme_st1q_horiz: {
5382 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
5383 if (!Idx || Idx->getOpcode() != Instruction::Add)
5384 return false;
5385 Ops.push_back(&II->getOperandUse(3));
5386 return true;
5387 }
5388 case Intrinsic::aarch64_neon_pmull:
5389 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
5390 return false;
5391 Ops.push_back(&II->getOperandUse(0));
5392 Ops.push_back(&II->getOperandUse(1));
5393 return true;
5394 case Intrinsic::aarch64_neon_pmull64:
5395 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
5396 II->getArgOperand(1)))
5397 return false;
5398 Ops.push_back(&II->getArgOperandUse(0));
5399 Ops.push_back(&II->getArgOperandUse(1));
5400 return true;
5401 case Intrinsic::masked_gather:
5402 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
5403 return false;
5404 Ops.push_back(&II->getArgOperandUse(0));
5405 return true;
5406 case Intrinsic::masked_scatter:
5407 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
5408 return false;
5409 Ops.push_back(&II->getArgOperandUse(1));
5410 return true;
5411 default:
5412 return false;
5413 }
5414 }
5415
5416 auto ShouldSinkCondition = [](Value *Cond) -> bool {
5417 auto *II = dyn_cast<IntrinsicInst>(Cond);
5418 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
5419 isa<ScalableVectorType>(II->getOperand(0)->getType());
5420 };
5421
5422 switch (I->getOpcode()) {
5423 case Instruction::GetElementPtr:
5424 case Instruction::Add:
5425 case Instruction::Sub:
5426 // Sink vscales closer to uses for better isel
5427 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
5428 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
5429 Ops.push_back(&I->getOperandUse(Op));
5430 return true;
5431 }
5432 }
5433 break;
5434 case Instruction::Select: {
5435 if (!ShouldSinkCondition(I->getOperand(0)))
5436 return false;
5437
5438 Ops.push_back(&I->getOperandUse(0));
5439 return true;
5440 }
5441 case Instruction::Br: {
5442 if (cast<BranchInst>(I)->isUnconditional())
5443 return false;
5444
5445 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
5446 return false;
5447
5448 Ops.push_back(&I->getOperandUse(0));
5449 return true;
5450 }
5451 default:
5452 break;
5453 }
5454
5455 if (!I->getType()->isVectorTy())
5456 return false;
5457
5458 switch (I->getOpcode()) {
5459 case Instruction::Sub:
5460 case Instruction::Add: {
5461 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
5462 return false;
5463
5464 // If the exts' operands extract either the lower or upper elements, we
5465 // can sink them too.
5466 auto Ext1 = cast<Instruction>(I->getOperand(0));
5467 auto Ext2 = cast<Instruction>(I->getOperand(1));
5468 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
5469 Ops.push_back(&Ext1->getOperandUse(0));
5470 Ops.push_back(&Ext2->getOperandUse(0));
5471 }
5472
5473 Ops.push_back(&I->getOperandUse(0));
5474 Ops.push_back(&I->getOperandUse(1));
5475
5476 return true;
5477 }
5478 case Instruction::Or: {
5479 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
5480 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
5481 if (ST->hasNEON()) {
5482 Instruction *OtherAnd, *IA, *IB;
5483 Value *MaskValue;
5484 // MainAnd refers to And instruction that has 'Not' as one of its operands
5485 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
5486 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
5487 m_Instruction(IA)))))) {
5488 if (match(OtherAnd,
5489 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
5490 Instruction *MainAnd = I->getOperand(0) == OtherAnd
5491 ? cast<Instruction>(I->getOperand(1))
5492 : cast<Instruction>(I->getOperand(0));
5493
5494 // Both Ands should be in same basic block as Or
5495 if (I->getParent() != MainAnd->getParent() ||
5496 I->getParent() != OtherAnd->getParent())
5497 return false;
5498
5499 // Non-mask operands of both Ands should also be in same basic block
5500 if (I->getParent() != IA->getParent() ||
5501 I->getParent() != IB->getParent())
5502 return false;
5503
5504 Ops.push_back(
5505 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
5506 Ops.push_back(&I->getOperandUse(0));
5507 Ops.push_back(&I->getOperandUse(1));
5508
5509 return true;
5510 }
5511 }
5512 }
5513
5514 return false;
5515 }
5516 case Instruction::Mul: {
5517 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
5518 auto *Ty = cast<VectorType>(V->getType());
5519 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5520 if (Ty->isScalableTy())
5521 return false;
5522
5523 // Indexed variants of Mul exist for i16 and i32 element types only.
5524 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
5525 };
5526
5527 int NumZExts = 0, NumSExts = 0;
5528 for (auto &Op : I->operands()) {
5529 // Make sure we are not already sinking this operand
5530 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
5531 continue;
5532
5533 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
5534 auto *Ext = cast<Instruction>(Op);
5535 auto *ExtOp = Ext->getOperand(0);
5536 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
5537 Ops.push_back(&Ext->getOperandUse(0));
5538 Ops.push_back(&Op);
5539
5540 if (isa<SExtInst>(Ext))
5541 NumSExts++;
5542 else
5543 NumZExts++;
5544
5545 continue;
5546 }
5547
5548 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
5549 if (!Shuffle)
5550 continue;
5551
5552 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
5553 // operand and the s/zext can help create indexed s/umull. This is
5554 // especially useful to prevent i64 mul being scalarized.
5555 if (isSplatShuffle(Shuffle) &&
5556 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
5557 Ops.push_back(&Shuffle->getOperandUse(0));
5558 Ops.push_back(&Op);
5559 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
5560 NumSExts++;
5561 else
5562 NumZExts++;
5563 continue;
5564 }
5565
5566 Value *ShuffleOperand = Shuffle->getOperand(0);
5567 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
5568 if (!Insert)
5569 continue;
5570
5571 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
5572 if (!OperandInstr)
5573 continue;
5574
5575 ConstantInt *ElementConstant =
5576 dyn_cast<ConstantInt>(Insert->getOperand(2));
5577 // Check that the insertelement is inserting into element 0
5578 if (!ElementConstant || !ElementConstant->isZero())
5579 continue;
5580
5581 unsigned Opcode = OperandInstr->getOpcode();
5582 if (Opcode == Instruction::SExt)
5583 NumSExts++;
5584 else if (Opcode == Instruction::ZExt)
5585 NumZExts++;
5586 else {
5587 // If we find that the top bits are known 0, then we can sink and allow
5588 // the backend to generate a umull.
5589 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
5590 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
5591 const DataLayout &DL = I->getDataLayout();
5592 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
5593 continue;
5594 NumZExts++;
5595 }
5596
5597 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
5598 // the And, just to hoist it again back to the load.
5599 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
5600 Ops.push_back(&Insert->getOperandUse(1));
5601 Ops.push_back(&Shuffle->getOperandUse(0));
5602 Ops.push_back(&Op);
5603 }
5604
5605 // It is profitable to sink if we found two of the same type of extends.
5606 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
5607 return true;
5608
5609 // Otherwise, see if we should sink splats for indexed variants.
5610 if (!ShouldSinkSplatForIndexedVariant(I))
5611 return false;
5612
5613 Ops.clear();
5614 if (isSplatShuffle(I->getOperand(0)))
5615 Ops.push_back(&I->getOperandUse(0));
5616 if (isSplatShuffle(I->getOperand(1)))
5617 Ops.push_back(&I->getOperandUse(1));
5618
5619 return !Ops.empty();
5620 }
5621 case Instruction::FMul: {
5622 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5623 if (I->getType()->isScalableTy())
5624 return false;
5625
5626 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5627 !ST->hasFullFP16())
5628 return false;
5629
5630 // Sink splats for index lane variants
5631 if (isSplatShuffle(I->getOperand(0)))
5632 Ops.push_back(&I->getOperandUse(0));
5633 if (isSplatShuffle(I->getOperand(1)))
5634 Ops.push_back(&I->getOperandUse(1));
5635 return !Ops.empty();
5636 }
5637 default:
5638 return false;
5639 }
5640 return false;
5641}
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
unsigned countLeadingOnes() const
Definition: APInt.h:1603
void negate()
Negate this APInt in place.
Definition: APInt.h:1450
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
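The APInt queries above (popcount, isPowerOf2, isNegatedPowerOf2, logBase2, getSExtValue) are the usual building blocks when a cost model classifies an immediate. A minimal, self-contained sketch; the classification itself is invented purely to exercise the listed calls:
  #include "llvm/ADT/APInt.h"
  #include <cstdint>

  // Illustrative only: bucket a 64-bit immediate using the APInt queries above.
  static unsigned classifyImm(uint64_t V) {
    llvm::APInt Imm(/*numBits=*/64, V);
    if (Imm.isPowerOf2())
      return Imm.logBase2();       // exact power of two: usable as a shift amount
    if (Imm.isNegatedPowerOf2())
      return Imm.popcount();       // -Imm is a power of two: report the set-bit count
    return Imm.countLeadingOnes(); // otherwise fall back to a leading-ones count
  }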
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:622
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:306
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:218
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isIntPredicate() const
Definition: InstrTypes.h:781
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1672
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition: IRBuilder.h:92
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:89
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:578
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1163
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:546
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:563
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2234
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1677
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:566
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:573
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
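The IRBuilder creation helpers listed above are what intrinsic-lowering and IR-rewriting code typically uses to emit replacement instructions. A small sketch, assuming the caller supplies a basic block plus a vector type, pointer, mask and pass-through value of matching shapes:
  #include "llvm/IR/IRBuilder.h"

  // Sketch: load a vector under a mask, then store it back under the same mask.
  static void emitMaskedCopy(llvm::BasicBlock *BB, llvm::Type *VecTy,
                             llvm::Value *Ptr, llvm::Value *Mask,
                             llvm::Value *PassThru) {
    llvm::IRBuilder<> B(BB);                 // insertion point: end of BB
    llvm::CallInst *Ld =
        B.CreateMaskedLoad(VecTy, Ptr, llvm::Align(16), Mask, PassThru);
    B.CreateMaskedStore(Ld, Ptr, llvm::Align(16), Mask);
  }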
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:48
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:694
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:651
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
This instruction constructs a fixed permutation of two input vectors.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
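StringRef::split and StringSwitch, both listed above, are the standard way to parse small "mode+flags" style option strings. A sketch with invented option names:
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"

  // Sketch: map the part before the first '+' to a mode, ignoring the rest.
  static int parseMode(llvm::StringRef In) {
    auto [Mode, Rest] = In.split('+'); // Rest is everything after the first '+'
    (void)Rest;                        // flags part deliberately unused here
    return llvm::StringSwitch<int>(Mode)
        .Case("off", 0)
        .Case("simple", 1)
        .Case("all", 2)
        .Default(-1);
  }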
Class to represent struct types.
Definition: DerivedTypes.h:218
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
int getNumOccurrences() const
Definition: CommandLine.h:399
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
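ElementCount and TypeSize share the fixed-versus-scalable queries listed above: a scalable quantity is a multiple of the runtime vscale, and getKnownMinValue() returns only the compile-time multiplier. A small sketch:
  #include "llvm/Support/TypeSize.h"

  static void quantityDemo() {
    llvm::ElementCount FixedEC = llvm::ElementCount::getFixed(4);    // exactly 4 lanes
    llvm::ElementCount ScalEC  = llvm::ElementCount::getScalable(4); // 4 x vscale lanes
    bool IsScalable   = ScalEC.isScalable();            // true
    unsigned MinLanes = ScalEC.getKnownMinValue();       // 4
    llvm::TypeSize Bits = llvm::TypeSize::getScalable(128); // 128 x vscale bits
    (void)FixedEC; (void)IsScalable; (void)MinLanes; (void)Bits;
  }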
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
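isLogicalImmediate and expandMOVImm are the two helpers that immediate-materialization costing leans on: a value already encodable as a logical immediate is effectively free, otherwise the number of expanded MOV instructions is a reasonable proxy for its cost. A simplified sketch of that use (the first two includes are AArch64-internal headers and their paths are an assumption):
  #include "AArch64ExpandImm.h"
  #include "MCTargetDesc/AArch64AddressingModes.h"
  #include "llvm/ADT/SmallVector.h"

  // Sketch: estimate how many instructions a 64-bit immediate needs.
  static unsigned movImmInsnCount(uint64_t Val) {
    if (Val == 0 || llvm::AArch64_AM::isLogicalImmediate(Val, /*regSize=*/64))
      return 0; // encodable directly in an ALU instruction
    llvm::SmallVector<llvm::AArch64_IMM::ImmInsnModel, 4> Insn;
    llvm::AArch64_IMM::expandMOVImm(Val, /*BitSize=*/64, Insn);
    return Insn.size();
  }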
uint64_t getFMVPriority(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
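getOrInsertDeclaration resolves an Intrinsic::ID, plus any overloaded types, to a Function declaration in a module. A tiny sketch using llvm.vscale, which is overloaded only on its result type:
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IR/Type.h"

  // Sketch: get (or create) the declaration of llvm.vscale.i64 in module M.
  static llvm::Function *getVScaleI64Decl(llvm::Module &M) {
    llvm::Type *I64 = llvm::Type::getInt64Ty(M.getContext());
    return llvm::Intrinsic::getOrInsertDeclaration(&M, llvm::Intrinsic::vscale, {I64});
  }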
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:931
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
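The m_* combinators above build declarative matchers over IR; match() then tries them against a Value and binds any captured sub-values. A short sketch that recognises an add of two zero-extends in either operand order:
  #include "llvm/IR/PatternMatch.h"

  // Sketch: does V compute add(zext A, zext B)? If so, capture A and B.
  static bool matchZExtZExtAdd(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
    using namespace llvm::PatternMatch;
    return match(V, m_c_Add(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))));
  }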
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
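CostTableLookup is the usual way a target keys per-operation costs by ISD opcode and legalized MVT. A sketch with invented table entries; real tables are looked up after type legalization, and the exact include paths may vary between LLVM versions:
  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/Support/InstructionCost.h"

  // Sketch: look up the cost of an ISD::ADD on the given legalized type.
  static llvm::InstructionCost lookupAddCost(llvm::MVT VT) {
    static const llvm::CostTblEntry Tbl[] = {
        {llvm::ISD::ADD, llvm::MVT::v4i32, 1}, // invented entries
        {llvm::ISD::ADD, llvm::MVT::v2i64, 1},
    };
    if (const auto *Entry = llvm::CostTableLookup(Tbl, llvm::ISD::ADD, VT))
      return Entry->Cost;
    return llvm::InstructionCost::getInvalid(); // no table entry for VT
  }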
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1077
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:298
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
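getPtrStride answers the question that gather/scatter and interleaving heuristics keep asking: is this access contiguous within the loop? A sketch, assuming the analysis objects are supplied by the caller:
  #include "llvm/Analysis/LoopAccessAnalysis.h"

  // Sketch: true if Ptr advances by exactly one AccessTy element (in either
  // direction) per iteration of L.
  static bool hasUnitStride(llvm::PredicatedScalarEvolution &PSE,
                            llvm::Type *AccessTy, llvm::Value *Ptr,
                            const llvm::Loop *L) {
    std::optional<int64_t> Stride = llvm::getPtrStride(PSE, AccessTy, Ptr, L);
    return Stride && (*Stride == 1 || *Stride == -1);
  }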
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
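isZIPMask and isUZPMask classify fixed shuffle masks against the zip and uzp permutes; the mask indexes the concatenation of the two source vectors, and WhichResultOut distinguishes the low-half variant from the high-half one. A sketch (these helpers live in the AArch64 backend's own headers, which are assumed to be on the include path):
  #include "llvm/ADT/ArrayRef.h"
  // #include "AArch64PerfectShuffle.h"   (assumed location of the helpers)

  // Sketch: classify a mask over two 8-element vectors as zip1/zip2.
  static bool classifyZip(llvm::ArrayRef<int> Mask, unsigned &WhichResult) {
    return Mask.size() == 8 && llvm::isZIPMask(Mask, /*NumElts=*/8, WhichResult);
  }
  // e.g. {0, 8, 1, 9, 2, 10, 3, 11} and {4, 12, 5, 13, 6, 14, 7, 15} both
  // return true, with WhichResult telling the two variants apart.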
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:384
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55