1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

static cl::opt<unsigned> DMBLookaheadThreshold(
    "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
    cl::desc("The number of instructions to search for a redundant dmb"));
74
75namespace {
76class TailFoldingOption {
77 // These bitfields will only ever be set to something non-zero in operator=,
78 // when setting the -sve-tail-folding option. This option should always be of
79 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
80 // InitialBits is one of (disabled|all|simple). EnableBits represents
81 // additional flags we're enabling, and DisableBits for those flags we're
82 // disabling. The default flag is tracked in the variable NeedsDefault, since
83 // at the time of setting the option we may not know what the default value
84 // for the CPU is.
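  //
  // For example, "-sve-tail-folding=all+noreverse" starts from All and then
  // clears the Reverse bit, while "-sve-tail-folding=default+reductions"
  // keeps the target's default behaviour and additionally enables
  // tail-folding of reduction loops.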
85 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
86 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
87 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
88
89 // This value needs to be initialised to true in case the user does not
90 // explicitly set the -sve-tail-folding option.
91 bool NeedsDefault = true;
92
93 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
94
95 void setNeedsDefault(bool V) { NeedsDefault = V; }
96
97 void setEnableBit(TailFoldingOpts Bit) {
98 EnableBits |= Bit;
99 DisableBits &= ~Bit;
100 }
101
102 void setDisableBit(TailFoldingOpts Bit) {
103 EnableBits &= ~Bit;
104 DisableBits |= Bit;
105 }
106
107 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
108 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
109
110 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
111 "Initial bits should only include one of "
112 "(disabled|all|simple|default)");
113 Bits = NeedsDefault ? DefaultBits : InitialBits;
114 Bits |= EnableBits;
115 Bits &= ~DisableBits;
116
117 return Bits;
118 }
119
120 void reportError(std::string Opt) {
121 errs() << "invalid argument '" << Opt
122 << "' to -sve-tail-folding=; the option should be of the form\n"
123 " (disabled|all|default|simple)[+(reductions|recurrences"
124 "|reverse|noreductions|norecurrences|noreverse)]\n";
125 report_fatal_error("Unrecognised tail-folding option");
126 }
127
128public:
129
130 void operator=(const std::string &Val) {
131 // If the user explicitly sets -sve-tail-folding= then treat as an error.
132 if (Val.empty()) {
133 reportError("");
134 return;
135 }
136
137 // Since the user is explicitly setting the option we don't automatically
138 // need the default unless they require it.
139 setNeedsDefault(false);
140
141 SmallVector<StringRef, 4> TailFoldTypes;
142 StringRef(Val).split(TailFoldTypes, '+', -1, false);
143
144 unsigned StartIdx = 1;
145 if (TailFoldTypes[0] == "disabled")
146 setInitialBits(TailFoldingOpts::Disabled);
147 else if (TailFoldTypes[0] == "all")
148 setInitialBits(TailFoldingOpts::All);
149 else if (TailFoldTypes[0] == "default")
150 setNeedsDefault(true);
151 else if (TailFoldTypes[0] == "simple")
152 setInitialBits(TailFoldingOpts::Simple);
153 else {
154 StartIdx = 0;
155 setInitialBits(TailFoldingOpts::Disabled);
156 }
157
158 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
159 if (TailFoldTypes[I] == "reductions")
160 setEnableBit(TailFoldingOpts::Reductions);
161 else if (TailFoldTypes[I] == "recurrences")
162 setEnableBit(TailFoldingOpts::Recurrences);
163 else if (TailFoldTypes[I] == "reverse")
164 setEnableBit(TailFoldingOpts::Reverse);
165 else if (TailFoldTypes[I] == "noreductions")
166 setDisableBit(TailFoldingOpts::Reductions);
167 else if (TailFoldTypes[I] == "norecurrences")
168 setDisableBit(TailFoldingOpts::Recurrences);
169 else if (TailFoldTypes[I] == "noreverse")
170 setDisableBit(TailFoldingOpts::Reverse);
171 else
172 reportError(Val);
173 }
174 }
175
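  // Returns true if all of the Required bits are set once the target's
  // default has been combined with any user-specified enables/disables.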
176 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
177 return (getBits(DefaultBits) & Required) == Required;
178 }
179};
180} // namespace
181
TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions Use tail-folding for loops containing reductions"
        "\nnoreductions Inverse of above"
        "\nrecurrences Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
218
219static bool isSMEABIRoutineCall(const CallInst &CI) {
220 const auto *F = CI.getCalledFunction();
221 return F && StringSwitch<bool>(F->getName())
222 .Case("__arm_sme_state", true)
223 .Case("__arm_tpidr2_save", true)
224 .Case("__arm_tpidr2_restore", true)
225 .Case("__arm_za_disable", true)
226 .Default(false);
227}
228
229/// Returns true if the function has explicit operations that can only be
230/// lowered using incompatible instructions for the selected mode. This also
231/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}
247
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
251
252 // When inlining, we should consider the body of the function, not the
253 // interface.
254 if (CalleeAttrs.hasStreamingBody()) {
255 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
256 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
257 }
258
259 if (CalleeAttrs.isNewZA())
260 return false;
261
262 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
263 CallerAttrs.requiresSMChange(CalleeAttrs) ||
264 CallerAttrs.requiresPreservingZT0(CalleeAttrs)) {
265 if (hasPossibleIncompatibleOps(Callee))
266 return false;
267 }
268
269 return BaseT::areInlineCompatible(Caller, Callee);
270}
271
bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
275 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
276 return false;
277
278 // We need to ensure that argument promotion does not attempt to promote
279 // pointers to fixed-length vector types larger than 128 bits like
280 // <8 x float> (and pointers to aggregate types which have such fixed-length
281 // vector type members) into the values of the pointees. Such vector types
282 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
283 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
284 // types can be safely treated as 128-bit NEON types and they cannot be
285 // distinguished in IR.
286 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
287 auto FVTy = dyn_cast<FixedVectorType>(Ty);
288 return FVTy &&
289 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
290 }))
291 return false;
292
293 return true;
294}
295
unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
299 // This function calculates a penalty for executing Call in F.
300 //
301 // There are two ways this function can be called:
302 // (1) F:
303 // call from F -> G (the call here is Call)
304 //
305 // For (1), Call.getCaller() == F, so it will always return a high cost if
306 // a streaming-mode change is required (thus promoting the need to inline the
307 // function)
308 //
309 // (2) F:
310 // call from F -> G (the call here is not Call)
311 // G:
312 // call from G -> H (the call here is Call)
313 //
314 // For (2), if after inlining the body of G into F the call to H requires a
315 // streaming-mode change, and the call to G from F would also require a
316 // streaming-mode change, then there is benefit to do the streaming-mode
317 // change only once and avoid inlining of G into F.
318 SMEAttrs FAttrs(*F);
319 SMEAttrs CalleeAttrs(Call);
320 if (FAttrs.requiresSMChange(CalleeAttrs)) {
321 if (F == Call.getCaller()) // (1)
322 return CallPenaltyChangeSM * DefaultCallPenalty;
323 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
324 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
325 }
326
327 return DefaultCallPenalty;
328}
329
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}
336
337/// Calculate the cost of materializing a 64-bit value. This helper
338/// method might only calculate a fraction of a larger immediate. Therefore it
339/// is valid to return a cost of ZERO.
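/// In the worst case an arbitrary 64-bit constant expands to four
/// instructions (a MOVZ plus three MOVKs); zero and valid logical immediates
/// cost nothing here.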
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}
353
354/// Calculate the cost of materializing the given constant.
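/// Constants wider than 64 bits are split into 64-bit chunks and each chunk
/// is costed independently (e.g. an i128 immediate is costed as two chunks),
/// with a minimum total cost of one instruction.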
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
379
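// Return the cost of an immediate appearing as operand Idx of an instruction
// with the given Opcode. Immediates that are cheap enough to rematerialise
// (or that the instruction can encode directly) are reported as TCC_Free so
// that constant hoisting leaves them alone.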
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
384 assert(Ty->isIntegerTy());
385
386 unsigned BitSize = Ty->getPrimitiveSizeInBits();
387 // There is no cost model for constants with a bit size of 0. Return TCC_Free
388 // here, so that constant hoisting will ignore this constant.
389 if (BitSize == 0)
390 return TTI::TCC_Free;
391
392 unsigned ImmIdx = ~0U;
393 switch (Opcode) {
394 default:
395 return TTI::TCC_Free;
396 case Instruction::GetElementPtr:
397 // Always hoist the base address of a GetElementPtr.
398 if (Idx == 0)
399 return 2 * TTI::TCC_Basic;
400 return TTI::TCC_Free;
401 case Instruction::Store:
402 ImmIdx = 0;
403 break;
404 case Instruction::Add:
405 case Instruction::Sub:
406 case Instruction::Mul:
407 case Instruction::UDiv:
408 case Instruction::SDiv:
409 case Instruction::URem:
410 case Instruction::SRem:
411 case Instruction::And:
412 case Instruction::Or:
413 case Instruction::Xor:
414 case Instruction::ICmp:
415 ImmIdx = 1;
416 break;
417 // Always return TCC_Free for the shift value of a shift instruction.
418 case Instruction::Shl:
419 case Instruction::LShr:
420 case Instruction::AShr:
421 if (Idx == 1)
422 return TTI::TCC_Free;
423 break;
424 case Instruction::Trunc:
425 case Instruction::ZExt:
426 case Instruction::SExt:
427 case Instruction::IntToPtr:
428 case Instruction::PtrToInt:
429 case Instruction::BitCast:
430 case Instruction::PHI:
431 case Instruction::Call:
432 case Instruction::Select:
433 case Instruction::Ret:
434 case Instruction::Load:
435 break;
436 }
437
  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
447
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());
453
454 unsigned BitSize = Ty->getPrimitiveSizeInBits();
455 // There is no cost model for constants with a bit size of 0. Return TCC_Free
456 // here, so that constant hoisting will ignore this constant.
457 if (BitSize == 0)
458 return TTI::TCC_Free;
459
460 // Most (all?) AArch64 intrinsics do not support folding immediates into the
461 // selected instruction, so we compute the materialization cost for the
462 // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

466 switch (IID) {
467 default:
468 return TTI::TCC_Free;
469 case Intrinsic::sadd_with_overflow:
470 case Intrinsic::uadd_with_overflow:
471 case Intrinsic::ssub_with_overflow:
472 case Intrinsic::usub_with_overflow:
473 case Intrinsic::smul_with_overflow:
474 case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
482 break;
483 case Intrinsic::experimental_stackmap:
484 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
485 return TTI::TCC_Free;
486 break;
487 case Intrinsic::experimental_patchpoint_void:
488 case Intrinsic::experimental_patchpoint:
489 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
490 return TTI::TCC_Free;
491 break;
492 case Intrinsic::experimental_gc_statepoint:
493 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
494 return TTI::TCC_Free;
495 break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
499
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}
508
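// Returns true for scalable vectors whose known minimum size does not fill a
// whole 128-bit SVE register granule, e.g. <vscale x 2 x i32> (64 bits
// minimum).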
static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
513
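// Cost model for the SVE2 histogram (histcnt) intrinsic. For example, a
// <vscale x 8 x ptr> vector of bucket pointers with i32 counters legalizes to
// two nxv4i32 HISTCNT operations, giving a cost of 2 * BaseHistCntCost.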
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
516 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
517 unsigned TotalHistCnts = 1;
518
519 unsigned EltSize = EltTy->getScalarSizeInBits();
520 // Only allow (up to 64b) integers or pointers
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
    return InstructionCost::getInvalid();

524 // FIXME: We should be able to generate histcnt for fixed-length vectors
525 // using ptrue with a specific VL.
526 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
527 unsigned EC = VTy->getElementCount().getKnownMinValue();
    if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
      return InstructionCost::getInvalid();

531 // HistCnt only supports 32b and 64b element types
532 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
533
    if (EC == 2 || (LegalEltSize == 32 && EC == 4))
      return InstructionCost(BaseHistCntCost);

537 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
538 TotalHistCnts = EC / NaturalVectorWidth;
539 }
540
541 return InstructionCost(BaseHistCntCost * TotalHistCnts);
542}
543
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

556 switch (ICA.getID()) {
557 case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
561 case Intrinsic::umin:
562 case Intrinsic::umax:
563 case Intrinsic::smin:
564 case Intrinsic::smax: {
565 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
566 MVT::v8i16, MVT::v2i32, MVT::v4i32,
567 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                         MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
571 if (LT.second == MVT::v2i64)
572 return LT.first * 2;
573 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
574 return LT.first;
575 break;
576 }
577 case Intrinsic::sadd_sat:
578 case Intrinsic::ssub_sat:
579 case Intrinsic::uadd_sat:
580 case Intrinsic::usub_sat: {
581 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
582 MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                      MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
586 // need to extend the type, as it uses shr(qadd(shl, shl)).
587 unsigned Instrs =
588 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
589 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
590 return LT.first * Instrs;
591 break;
592 }
593 case Intrinsic::abs: {
594 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
595 MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                      MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
599 return LT.first;
600 break;
601 }
602 case Intrinsic::bswap: {
603 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                      MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
607 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
608 return LT.first;
609 break;
610 }
611 case Intrinsic::stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
615 // (LT.first - 1) vector adds.
616 if (LT.first > 1) {
617 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
618 InstructionCost AddCost =
619 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
620 Cost += AddCost * (LT.first - 1);
621 }
622 return Cost;
623 }
624 case Intrinsic::vector_extract:
625 case Intrinsic::vector_insert: {
626 // If both the vector and subvector types are legal types and the index
627 // is 0, then this should be a no-op or simple operation; return a
628 // relatively low cost.
629
630 // If arguments aren't actually supplied, then we cannot determine the
631 // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;
635
636 LLVMContext &C = RetTy->getContext();
637 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
638 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
639 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
640 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
641 // Skip this if either the vector or subvector types are unpacked
642 // SVE types; they may get lowered to stack stores and loads.
643 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
644 break;
645
    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
650 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
651 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
652 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
653 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
654 return TTI::TCC_Free;
655 break;
656 }
657 case Intrinsic::bitreverse: {
658 static const CostTblEntry BitreverseTbl[] = {
659 {Intrinsic::bitreverse, MVT::i32, 1},
660 {Intrinsic::bitreverse, MVT::i64, 1},
661 {Intrinsic::bitreverse, MVT::v8i8, 1},
662 {Intrinsic::bitreverse, MVT::v16i8, 1},
663 {Intrinsic::bitreverse, MVT::v4i16, 2},
664 {Intrinsic::bitreverse, MVT::v8i16, 2},
665 {Intrinsic::bitreverse, MVT::v2i32, 2},
666 {Intrinsic::bitreverse, MVT::v4i32, 2},
667 {Intrinsic::bitreverse, MVT::v1i64, 2},
668 {Intrinsic::bitreverse, MVT::v2i64, 2},
669 };
670 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
671 const auto *Entry =
672 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
673 if (Entry) {
674 // Cost Model is using the legal type(i32) that i8 and i16 will be
675 // converted to +1 so that we match the actual lowering cost
676 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
677 TLI->getValueType(DL, RetTy, true) == MVT::i16)
678 return LegalisationCost.first * Entry->Cost + 1;
679
680 return LegalisationCost.first * Entry->Cost;
681 }
682 break;
683 }
684 case Intrinsic::ctpop: {
685 if (!ST->hasNEON()) {
686 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
687 return getTypeLegalizationCost(RetTy).first * 12;
688 }
689 static const CostTblEntry CtpopCostTbl[] = {
690 {ISD::CTPOP, MVT::v2i64, 4},
691 {ISD::CTPOP, MVT::v4i32, 3},
692 {ISD::CTPOP, MVT::v8i16, 2},
693 {ISD::CTPOP, MVT::v16i8, 1},
694 {ISD::CTPOP, MVT::i64, 4},
695 {ISD::CTPOP, MVT::v2i32, 3},
696 {ISD::CTPOP, MVT::v4i16, 2},
697 {ISD::CTPOP, MVT::v8i8, 1},
698 {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
702 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
703 // Extra cost of +1 when illegal vector types are legalized by promoting
704 // the integer type.
705 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
706 RetTy->getScalarSizeInBits()
707 ? 1
708 : 0;
709 return LT.first * Entry->Cost + ExtraCost;
710 }
711 break;
712 }
713 case Intrinsic::sadd_with_overflow:
714 case Intrinsic::uadd_with_overflow:
715 case Intrinsic::ssub_with_overflow:
716 case Intrinsic::usub_with_overflow:
717 case Intrinsic::smul_with_overflow:
718 case Intrinsic::umul_with_overflow: {
719 static const CostTblEntry WithOverflowCostTbl[] = {
720 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
721 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
722 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
723 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
724 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
725 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
726 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
727 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
728 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
729 {Intrinsic::usub_with_overflow, MVT::i8, 3},
730 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
731 {Intrinsic::usub_with_overflow, MVT::i16, 3},
732 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
733 {Intrinsic::usub_with_overflow, MVT::i32, 1},
734 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
735 {Intrinsic::usub_with_overflow, MVT::i64, 1},
736 {Intrinsic::smul_with_overflow, MVT::i8, 5},
737 {Intrinsic::umul_with_overflow, MVT::i8, 4},
738 {Intrinsic::smul_with_overflow, MVT::i16, 5},
739 {Intrinsic::umul_with_overflow, MVT::i16, 4},
740 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
741 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
742 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
743 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
744 };
745 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
746 if (MTy.isSimple())
747 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
748 MTy.getSimpleVT()))
749 return Entry->Cost;
750 break;
751 }
752 case Intrinsic::fptosi_sat:
753 case Intrinsic::fptoui_sat: {
754 if (ICA.getArgTypes().empty())
755 break;
756 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
757 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
758 EVT MTy = TLI->getValueType(DL, RetTy);
759 // Check for the legal types, which are where the size of the input and the
760 // output are the same, or we are using cvt f64->i32 or f32->i64.
761 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
762 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
763 LT.second == MVT::v2f64)) {
764 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
765 (LT.second == MVT::f64 && MTy == MVT::i32) ||
766 (LT.second == MVT::f32 && MTy == MVT::i64)))
767 return LT.first;
768 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
769 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
770 MTy.getScalarSizeInBits() == 64)
771 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
772 }
773 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
774 // f32.
775 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
776 return LT.first + getIntrinsicInstrCost(
777 {ICA.getID(),
778 RetTy,
779 {ICA.getArgTypes()[0]->getWithNewType(
780 Type::getFloatTy(RetTy->getContext()))}},
781 CostKind);
782 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
783 (LT.second == MVT::f16 && MTy == MVT::i64) ||
784 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
785 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
786 return LT.first;
787 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
788 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
789 MTy.getScalarSizeInBits() == 32)
790 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
791 // Extending vector types v8f16->v8i32. These current scalarize but the
792 // codegen could be better.
793 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
794 MTy.getScalarSizeInBits() == 64)
795 return MTy.getVectorNumElements() * 3;
796
797 // If we can we use a legal convert followed by a min+max
798 if ((LT.second.getScalarType() == MVT::f32 ||
799 LT.second.getScalarType() == MVT::f64 ||
800 LT.second.getScalarType() == MVT::f16) &&
801 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
802 Type *LegalTy =
803 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
804 if (LT.second.isVector())
805 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost +
             ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
                                                                           : 1);
816 }
817 // Otherwise we need to follow the default expansion that clamps the value
818 // using a float min/max with a fcmp+sel for nan handling when signed.
819 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
820 RetTy = RetTy->getScalarType();
821 if (LT.second.isVector()) {
822 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
823 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
824 }
    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
    if (IsSigned) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
    }
839 return LT.first * Cost;
840 }
841 case Intrinsic::fshl:
842 case Intrinsic::fshr: {
843 if (ICA.getArgs().empty())
844 break;
845
846 // TODO: Add handling for fshl where third argument is not a constant.
847 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
848 if (!OpInfoZ.isConstant())
849 break;
850
851 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
852 if (OpInfoZ.isUniform()) {
853 // FIXME: The costs could be lower if the codegen is better.
854 static const CostTblEntry FshlTbl[] = {
855 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
856 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
857 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
858 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
859 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
860 // to avoid having to duplicate the costs.
861 const auto *Entry =
862 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
863 if (Entry)
864 return LegalisationCost.first * Entry->Cost;
865 }
866
867 auto TyL = getTypeLegalizationCost(RetTy);
868 if (!RetTy->isIntegerTy())
869 break;
870
871 // Estimate cost manually, as types like i8 and i16 will get promoted to
872 // i32 and CostTableLookup will ignore the extra conversion cost.
873 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
874 RetTy->getScalarSizeInBits() < 64) ||
875 (RetTy->getScalarSizeInBits() % 64 != 0);
876 unsigned ExtraCost = HigherCost ? 1 : 0;
877 if (RetTy->getScalarSizeInBits() == 32 ||
878 RetTy->getScalarSizeInBits() == 64)
879 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
880 // extr instruction.
881 else if (HigherCost)
882 ExtraCost = 1;
883 else
884 break;
885 return TyL.first + ExtraCost;
886 }
887 case Intrinsic::get_active_lane_mask: {
888 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
889 if (RetTy) {
890 EVT RetVT = getTLI()->getValueType(DL, RetTy);
891 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
892 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
893 !getTLI()->isTypeLegal(RetVT)) {
894 // We don't have enough context at this point to determine if the mask
895 // is going to be kept live after the block, which will force the vXi1
896 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
897 // For now, we just assume the vectorizer created this intrinsic and
898 // the result will be the input for a PHI. In this case the cost will
899 // be extremely high for fixed-width vectors.
900 // NOTE: getScalarizationOverhead returns a cost that's far too
901 // pessimistic for the actual generated codegen. In reality there are
902 // two instructions generated per lane.
903 return RetTy->getNumElements() * 2;
904 }
905 }
906 break;
907 }
908 case Intrinsic::experimental_vector_match: {
909 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
910 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
911 unsigned SearchSize = NeedleTy->getNumElements();
912 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
913 // Base cost for MATCH instructions. At least on the Neoverse V2 and
914 // Neoverse V3, these are cheap operations with the same latency as a
915 // vector ADD. In most cases, however, we also need to do an extra DUP.
916 // For fixed-length vectors we currently need an extra five--six
917 // instructions besides the MATCH.
919 if (isa<FixedVectorType>(RetTy))
920 Cost += 10;
921 return Cost;
922 }
923 break;
924 }
925 default:
926 break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
930
931/// The function will remove redundant reinterprets casting in the presence
932/// of the control flow
933static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
934 IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();
937
938 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
939 assert(PN && "Expected Phi Node!");
940
941 // Don't create a new Phi unless we can remove the old one.
942 if (!PN->hasOneUse())
943 return std::nullopt;
944
945 for (Value *IncValPhi : PN->incoming_values()) {
946 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
947 if (!Reinterpret ||
948 Reinterpret->getIntrinsicID() !=
949 Intrinsic::aarch64_sve_convert_to_svbool ||
950 RequiredType != Reinterpret->getArgOperand(0)->getType())
951 return std::nullopt;
952 }
953
954 // Create the new Phi
955 IC.Builder.SetInsertPoint(PN);
956 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
957 Worklist.push_back(PN);
958
959 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
960 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
961 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
962 Worklist.push_back(Reinterpret);
963 }
964
965 // Cleanup Phi Node and reinterprets
966 return IC.replaceInstUsesWith(II, NPN);
967}
968
969// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
970// => (binop (pred) (from_svbool _) (from_svbool _))
971//
972// The above transformation eliminates a `to_svbool` in the predicate
973// operand of bitwise operation `binop` by narrowing the vector width of
974// the operation. For example, it would convert a `<vscale x 16 x i1>
975// and` into a `<vscale x 4 x i1> and`. This is profitable because
976// to_svbool must zero the new lanes during widening, whereas
977// from_svbool is free.
978static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
981 if (!BinOp)
982 return std::nullopt;
983
984 auto IntrinsicID = BinOp->getIntrinsicID();
985 switch (IntrinsicID) {
986 case Intrinsic::aarch64_sve_and_z:
987 case Intrinsic::aarch64_sve_bic_z:
988 case Intrinsic::aarch64_sve_eor_z:
989 case Intrinsic::aarch64_sve_nand_z:
990 case Intrinsic::aarch64_sve_nor_z:
991 case Intrinsic::aarch64_sve_orn_z:
992 case Intrinsic::aarch64_sve_orr_z:
993 break;
994 default:
995 return std::nullopt;
996 }
997
998 auto BinOpPred = BinOp->getOperand(0);
999 auto BinOpOp1 = BinOp->getOperand(1);
1000 auto BinOpOp2 = BinOp->getOperand(2);
1001
1002 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1003 if (!PredIntr ||
1004 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1005 return std::nullopt;
1006
1007 auto PredOp = PredIntr->getOperand(0);
1008 auto PredOpTy = cast<VectorType>(PredOp->getType());
1009 if (PredOpTy != II.getType())
1010 return std::nullopt;
1011
1012 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1013 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1014 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1015 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1016 if (BinOpOp1 == BinOpOp2)
1017 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1018 else
1019 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1020 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1021
1022 auto NarrowedBinOp =
1023 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1024 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1025}
1026
1027static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI Node
1030 if (isa<PHINode>(II.getArgOperand(0)))
1031 return processPhiNode(IC, II);
1032
1033 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1034 return BinOpCombine;
1035
1036 // Ignore converts to/from svcount_t.
1037 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1038 isa<TargetExtType>(II.getType()))
1039 return std::nullopt;
1040
1041 SmallVector<Instruction *, 32> CandidatesForRemoval;
1042 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1043
1044 const auto *IVTy = cast<VectorType>(II.getType());
1045
1046 // Walk the chain of conversions.
1047 while (Cursor) {
1048 // If the type of the cursor has fewer lanes than the final result, zeroing
1049 // must take place, which breaks the equivalence chain.
1050 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1051 if (CursorVTy->getElementCount().getKnownMinValue() <
1052 IVTy->getElementCount().getKnownMinValue())
1053 break;
1054
1055 // If the cursor has the same type as I, it is a viable replacement.
1056 if (Cursor->getType() == IVTy)
1057 EarliestReplacement = Cursor;
1058
1059 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1060
1061 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1062 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1063 Intrinsic::aarch64_sve_convert_to_svbool ||
1064 IntrinsicCursor->getIntrinsicID() ==
1065 Intrinsic::aarch64_sve_convert_from_svbool))
1066 break;
1067
1068 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1069 Cursor = IntrinsicCursor->getOperand(0);
1070 }
1071
1072 // If no viable replacement in the conversion chain was found, there is
1073 // nothing to do.
1074 if (!EarliestReplacement)
1075 return std::nullopt;
1076
1077 return IC.replaceInstUsesWith(II, EarliestReplacement);
1078}
1079
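// Returns true if Pred is a ptrue with the "all" pattern, possibly looking
// through a convert.to/from.svbool round trip that does not drop any lanes.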
1080static bool isAllActivePredicate(Value *Pred) {
1081 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1082 Value *UncastedPred;
1083 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1084 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1085 m_Value(UncastedPred)))))
1086 // If the predicate has the same or less lanes than the uncasted
1087 // predicate then we know the casting has no effect.
1088 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1089 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1090 Pred = UncastedPred;
1091
1092 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1093 m_ConstantInt<AArch64SVEPredPattern::all>()));
1094}
1095
1096// Simplify unary operation where predicate has all inactive lanes by replacing
1097// instruction with its operand
1098static std::optional<Instruction *>
instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
                              bool hasInactiveVector) {
1101 int PredOperand = hasInactiveVector ? 1 : 0;
1102 int ReplaceOperand = hasInactiveVector ? 0 : 1;
1103 if (match(II.getOperand(PredOperand), m_ZeroInt())) {
1104 IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
1105 return IC.eraseInstFromFunction(II);
1106 }
1107 return std::nullopt;
1108}
1109
1110// Simplify unary operation where predicate has all inactive lanes or
1111// replace unused first operand with undef when all lanes are active
1112static std::optional<Instruction *>
instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {
  if (isAllActivePredicate(II.getOperand(1)) &&
1115 !isa<llvm::UndefValue>(II.getOperand(0)) &&
1116 !isa<llvm::PoisonValue>(II.getOperand(0))) {
1117 Value *Undef = llvm::UndefValue::get(II.getType());
1118 return IC.replaceOperand(II, 0, Undef);
1119 }
1120 return instCombineSVENoActiveReplace(IC, II, true);
1121}
1122
1123// Erase unary operation where predicate has all inactive lanes
1124static std::optional<Instruction *>
instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
                                 int PredPos) {
1127 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1128 return IC.eraseInstFromFunction(II);
1129 }
1130 return std::nullopt;
1131}
1132
1133// Simplify operation where predicate has all inactive lanes by replacing
1134// instruction with zeroed object
1135static std::optional<Instruction *>
instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) {
  if (match(II.getOperand(0), m_ZeroInt())) {
1138 Constant *Node;
1139 Type *RetTy = II.getType();
1140 if (RetTy->isStructTy()) {
1141 auto StructT = cast<StructType>(RetTy);
1142 auto VecT = StructT->getElementType(0);
      SmallVector<Constant *> ZerVec;
      for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1145 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1146 : ConstantInt::get(VecT, 0));
1147 }
1148 Node = ConstantStruct::get(StructT, ZerVec);
1149 } else
1150 Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1151 : ConstantInt::get(II.getType(), 0);
1152
    IC.replaceInstUsesWith(II, Node);
    return IC.eraseInstFromFunction(II);
1155 }
1156 return std::nullopt;
1157}
1158
1159static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1160 IntrinsicInst &II) {
1161 // svsel(ptrue, x, y) => x
1162 auto *OpPredicate = II.getOperand(0);
1163 if (isAllActivePredicate(OpPredicate))
1164 return IC.replaceInstUsesWith(II, II.getOperand(1));
1165
1166 auto Select =
1167 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1168 return IC.replaceInstUsesWith(II, Select);
1169}
1170
1171static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1172 IntrinsicInst &II) {
1173 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1174 if (!Pg)
1175 return std::nullopt;
1176
1177 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1178 return std::nullopt;
1179
1180 const auto PTruePattern =
1181 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1182 if (PTruePattern != AArch64SVEPredPattern::vl1)
1183 return std::nullopt;
1184
1185 // The intrinsic is inserting into lane zero so use an insert instead.
1186 auto *IdxTy = Type::getInt64Ty(II.getContext());
1187 auto *Insert = InsertElementInst::Create(
1188 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1189 Insert->insertBefore(&II);
1190 Insert->takeName(&II);
1191
1192 return IC.replaceInstUsesWith(II, Insert);
1193}
1194
1195static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1196 IntrinsicInst &II) {
1197 // Replace DupX with a regular IR splat.
1198 auto *RetTy = cast<ScalableVectorType>(II.getType());
1199 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1200 II.getArgOperand(0));
1201 Splat->takeName(&II);
1202 return IC.replaceInstUsesWith(II, Splat);
1203}
1204
1205static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1206 IntrinsicInst &II) {
1207 LLVMContext &Ctx = II.getContext();
1208
1209 // Replace by zero constant when all lanes are inactive
1210 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1211 return II_NA;
1212
1213 // Check that the predicate is all active
1214 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1215 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1216 return std::nullopt;
1217
1218 const auto PTruePattern =
1219 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1220 if (PTruePattern != AArch64SVEPredPattern::all)
1221 return std::nullopt;
1222
1223 // Check that we have a compare of zero..
1224 auto *SplatValue =
1225 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1226 if (!SplatValue || !SplatValue->isZero())
1227 return std::nullopt;
1228
1229 // ..against a dupq
1230 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1231 if (!DupQLane ||
1232 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1233 return std::nullopt;
1234
1235 // Where the dupq is a lane 0 replicate of a vector insert
1236 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1237 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1238 return std::nullopt;
1239
1240 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1241 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1242 return std::nullopt;
1243
1244 // Where the vector insert is a fixed constant vector insert into undef at
1245 // index zero
1246 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1247 return std::nullopt;
1248
1249 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1250 return std::nullopt;
1251
1252 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1253 if (!ConstVec)
1254 return std::nullopt;
1255
1256 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1257 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1258 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1259 return std::nullopt;
1260
1261 unsigned NumElts = VecTy->getNumElements();
1262 unsigned PredicateBits = 0;
1263
1264 // Expand intrinsic operands to a 16-bit byte level predicate
1265 for (unsigned I = 0; I < NumElts; ++I) {
1266 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1267 if (!Arg)
1268 return std::nullopt;
1269 if (!Arg->isZero())
1270 PredicateBits |= 1 << (I * (16 / NumElts));
1271 }
1272
1273 // If all bits are zero bail early with an empty predicate
1274 if (PredicateBits == 0) {
1275 auto *PFalse = Constant::getNullValue(II.getType());
1276 PFalse->takeName(&II);
1277 return IC.replaceInstUsesWith(II, PFalse);
1278 }
1279
1280 // Calculate largest predicate type used (where byte predicate is largest)
1281 unsigned Mask = 8;
1282 for (unsigned I = 0; I < 16; ++I)
1283 if ((PredicateBits & (1 << I)) != 0)
1284 Mask |= (I % 8);
1285
1286 unsigned PredSize = Mask & -Mask;
1287 auto *PredType = ScalableVectorType::get(
1288 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1289
1290 // Ensure all relevant bits are set
1291 for (unsigned I = 0; I < 16; I += PredSize)
1292 if ((PredicateBits & (1 << I)) == 0)
1293 return std::nullopt;
1294
1295 auto *PTruePat =
1296 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1297 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1298 {PredType}, {PTruePat});
1299 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1300 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1301 auto *ConvertFromSVBool =
1302 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1303 {II.getType()}, {ConvertToSVBool});
1304
1305 ConvertFromSVBool->takeName(&II);
1306 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1307}
1308
1309static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1310 IntrinsicInst &II) {
1311 Value *Pg = II.getArgOperand(0);
1312 Value *Vec = II.getArgOperand(1);
1313 auto IntrinsicID = II.getIntrinsicID();
1314 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1315
1316 // lastX(splat(X)) --> X
1317 if (auto *SplatVal = getSplatValue(Vec))
1318 return IC.replaceInstUsesWith(II, SplatVal);
1319
1320 // If x and/or y is a splat value then:
1321 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1322 Value *LHS, *RHS;
1323 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1324 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1325 auto *OldBinOp = cast<BinaryOperator>(Vec);
1326 auto OpC = OldBinOp->getOpcode();
1327 auto *NewLHS =
1328 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1329 auto *NewRHS =
1330 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1333 return IC.replaceInstUsesWith(II, NewBinOp);
1334 }
1335 }
1336
1337 auto *C = dyn_cast<Constant>(Pg);
1338 if (IsAfter && C && C->isNullValue()) {
1339 // The intrinsic is extracting lane 0 so use an extract instead.
1340 auto *IdxTy = Type::getInt64Ty(II.getContext());
1341 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1342 Extract->insertBefore(&II);
1343 Extract->takeName(&II);
1344 return IC.replaceInstUsesWith(II, Extract);
1345 }
1346
1347 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1348 if (!IntrPG)
1349 return std::nullopt;
1350
1351 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1352 return std::nullopt;
1353
1354 const auto PTruePattern =
1355 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1356
1357 // Can the intrinsic's predicate be converted to a known constant index?
1358 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1359 if (!MinNumElts)
1360 return std::nullopt;
1361
1362 unsigned Idx = MinNumElts - 1;
1363 // Increment the index if extracting the element after the last active
1364 // predicate element.
1365 if (IsAfter)
1366 ++Idx;
1367
1368 // Ignore extracts whose index is larger than the known minimum vector
1369 // length. NOTE: This is an artificial constraint where we prefer to
1370 // maintain what the user asked for until an alternative is proven faster.
1371 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1372 if (Idx >= PgVTy->getMinNumElements())
1373 return std::nullopt;
1374
1375 // The intrinsic is extracting a fixed lane so use an extract instead.
1376 auto *IdxTy = Type::getInt64Ty(II.getContext());
1377 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1378 Extract->insertBefore(&II);
1379 Extract->takeName(&II);
1380 return IC.replaceInstUsesWith(II, Extract);
1381}
1382
1383static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1384 IntrinsicInst &II) {
1385 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1386 // integer variant across a variety of micro-architectures. Replace scalar
1387 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1388 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1389 // depending on the micro-architecture, but has been observed as generally
1390 // being faster, particularly when the CLAST[AB] op is a loop-carried
1391 // dependency.
1392 Value *Pg = II.getArgOperand(0);
1393 Value *Fallback = II.getArgOperand(1);
1394 Value *Vec = II.getArgOperand(2);
1395 Type *Ty = II.getType();
1396
1397 if (!Ty->isIntegerTy())
1398 return std::nullopt;
1399
1400 Type *FPTy;
1401 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1402 default:
1403 return std::nullopt;
1404 case 16:
1405 FPTy = IC.Builder.getHalfTy();
1406 break;
1407 case 32:
1408 FPTy = IC.Builder.getFloatTy();
1409 break;
1410 case 64:
1411 FPTy = IC.Builder.getDoubleTy();
1412 break;
1413 }
1414
1415 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1416 auto *FPVTy = VectorType::get(
1417 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1418 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1419 auto *FPII = IC.Builder.CreateIntrinsic(
1420 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1421 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1422 return IC.replaceInstUsesWith(II, FPIItoInt);
1423}
1424
1425static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1426 IntrinsicInst &II) {
1427 LLVMContext &Ctx = II.getContext();
1428 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1429 // can work with RDFFR_PP for ptest elimination.
1430 auto *AllPat =
1431 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1432 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1433 {II.getType()}, {AllPat});
1434 auto *RDFFR =
1435 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1436 RDFFR->takeName(&II);
1437 return IC.replaceInstUsesWith(II, RDFFR);
1438}
1439
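// Fold SVE element-counting intrinsics when the pattern makes the result
// known. For example, with NumElts == 4 (a CNTW), the "all" pattern becomes
// vscale * 4 and the "vl2" pattern folds to the constant 2.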
1440static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
  const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1443
1444 if (Pattern == AArch64SVEPredPattern::all) {
1445 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1446 auto *VScale = IC.Builder.CreateVScale(StepVal);
1447 VScale->takeName(&II);
1448 return IC.replaceInstUsesWith(II, VScale);
1449 }
1450
1451 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1452
1453 return MinNumElts && NumElts >= MinNumElts
1454 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1455 II, ConstantInt::get(II.getType(), MinNumElts)))
1456 : std::nullopt;
1457}
1458
1459static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1460 IntrinsicInst &II) {
1461 Value *PgVal = II.getArgOperand(0);
1462 Value *OpVal = II.getArgOperand(1);
1463
1464 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1465 // Later optimizations prefer this form.
1466 if (PgVal == OpVal &&
1467 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1468 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1469 Value *Ops[] = {PgVal, OpVal};
1470 Type *Tys[] = {PgVal->getType()};
1471
1472 auto *PTest =
1473 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1474 PTest->takeName(&II);
1475
1476 return IC.replaceInstUsesWith(II, PTest);
1477 }
1478
1479 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1480 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1481
1482 if (!Pg || !Op)
1483 return std::nullopt;
1484
1485 Intrinsic::ID OpIID = Op->getIntrinsicID();
1486
1487 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1488 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1489 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1490 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1491 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1492
1493 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1494
1495 PTest->takeName(&II);
1496 return IC.replaceInstUsesWith(II, PTest);
1497 }
1498
1499 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1500 // Later optimizations may rewrite sequence to use the flag-setting variant
1501 // of instruction X to remove PTEST.
1502 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1503 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1504 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1505 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1506 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1507 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1508 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1509 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1510 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1511 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1512 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1513 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1514 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1515 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1516 Type *Tys[] = {Pg->getType()};
1517
1518 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1519 PTest->takeName(&II);
1520
1521 return IC.replaceInstUsesWith(II, PTest);
1522 }
1523
1524 return std::nullopt;
1525}
1526
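// Fold a predicated multiply that feeds a predicated add/sub (sharing the
// same governing predicate) into a single fused multiply-add/sub intrinsic,
// e.g. add(p, a, mul(p, b, c)) with a single-use mul becomes mla(p, a, b, c).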
1527template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1528static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
1531 Value *P = II.getOperand(0);
1532 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1533 if (MergeIntoAddendOp) {
1534 AddendOp = II.getOperand(1);
1535 Mul = II.getOperand(2);
1536 } else {
1537 AddendOp = II.getOperand(2);
1538 Mul = II.getOperand(1);
1539 }
1540
1541 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1542 m_Value(MulOp1))))
1543 return std::nullopt;
1544
1545 if (!Mul->hasOneUse())
1546 return std::nullopt;
1547
1548 Instruction *FMFSource = nullptr;
1549 if (II.getType()->isFPOrFPVectorTy()) {
1550 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1551 // Stop the combine when the flags on the inputs differ in case dropping
1552 // flags would lead to us missing out on more beneficial optimizations.
1553 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1554 return std::nullopt;
1555 if (!FAddFlags.allowContract())
1556 return std::nullopt;
1557 FMFSource = &II;
1558 }
1559
1560 CallInst *Res;
1561 if (MergeIntoAddendOp)
1562 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1563 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1564 else
1565 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1566 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1567
1568 return IC.replaceInstUsesWith(II, Res);
1569}
1570
1571static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *Pred = II.getOperand(0);
1574 Value *PtrOp = II.getOperand(1);
1575 Type *VecTy = II.getType();
1576
1577 // Replace by zero constant when all lanes are inactive
1578 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1579 return II_NA;
1580
1581 if (isAllActivePredicate(Pred)) {
1582 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1583 Load->copyMetadata(II);
1584 return IC.replaceInstUsesWith(II, Load);
1585 }
1586
1587 CallInst *MaskedLoad =
1588 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1589 Pred, ConstantAggregateZero::get(VecTy));
1590 MaskedLoad->copyMetadata(II);
1591 return IC.replaceInstUsesWith(II, MaskedLoad);
1592}
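// Illustrative sketch: sve.ld1(<all-true pred>, %ptr) becomes a plain vector
// load, while any other predicate becomes a llvm.masked.load with a
// zeroinitializer passthru, as built above.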
1593
1594static std::optional<Instruction *>
1595 instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1596 Value *VecOp = II.getOperand(0);
1597 Value *Pred = II.getOperand(1);
1598 Value *PtrOp = II.getOperand(2);
1599
1600 if (isAllActivePredicate(Pred)) {
1601 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1602 Store->copyMetadata(II);
1603 return IC.eraseInstFromFunction(II);
1604 }
1605
1606 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1607 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1608 MaskedStore->copyMetadata(II);
1609 return IC.eraseInstFromFunction(II);
1610}
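// Illustrative sketch: sve.st1(%val, <all-true pred>, %ptr) becomes a plain
// store; any other predicate becomes a llvm.masked.store, mirroring the load
// case above.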
1611
1612 static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1613 switch (Intrinsic) {
1614 case Intrinsic::aarch64_sve_fmul_u:
1615 return Instruction::BinaryOps::FMul;
1616 case Intrinsic::aarch64_sve_fadd_u:
1617 return Instruction::BinaryOps::FAdd;
1618 case Intrinsic::aarch64_sve_fsub_u:
1619 return Instruction::BinaryOps::FSub;
1620 default:
1621 return Instruction::BinaryOpsEnd;
1622 }
1623}
1624
1625static std::optional<Instruction *>
1626 instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1627 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1628 if (II.isStrictFP())
1629 return std::nullopt;
1630
1631 auto *OpPredicate = II.getOperand(0);
1632 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1633 if (BinOpCode == Instruction::BinaryOpsEnd ||
1634 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1635 m_ConstantInt<AArch64SVEPredPattern::all>())))
1636 return std::nullopt;
1637 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1638 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1639 auto BinOp =
1640 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1641 return IC.replaceInstUsesWith(II, BinOp);
1642}
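// Illustrative sketch: sve.fmul_u(ptrue(all), %a, %b) (and likewise fadd_u and
// fsub_u) is rewritten above to the plain IR instruction fmul %a, %b, carrying
// over the call's fast-math flags.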
1643
1644// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1645// sve.add_u).
1646static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1647 Intrinsic::ID IID) {
1648 auto *OpPredicate = II.getOperand(0);
1649 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1650 m_ConstantInt<AArch64SVEPredPattern::all>())))
1651 return std::nullopt;
1652
1653 auto *Mod = II.getModule();
1654 auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()});
1655 II.setCalledFunction(NewDecl);
1656
1657 return &II;
1658}
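// Illustrative sketch: sve.add(ptrue(all), %a, %b) is re-targeted in place to
// the equivalent undefined-when-inactive form sve.add_u with the same
// operands.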
1659
1660 // Simplify operations where the predicate has all lanes inactive, or try to
1661 // replace them with the _u form when all lanes are active.
1662static std::optional<Instruction *>
1663 instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1664 Intrinsic::ID IID) {
1665 if (match(II.getOperand(0), m_ZeroInt())) {
1666 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1667 // inactive for sv[func]_m
1668 return IC.replaceInstUsesWith(II, II.getOperand(1));
1669 }
1670 return instCombineSVEAllActive(II, IID);
1671}
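// Illustrative sketch: for a merging intrinsic such as sve.add, an all-false
// predicate folds the call to its first data operand (the merge value), while
// an all-true predicate defers to the _u canonicalisation above.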
1672
1673static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1674 IntrinsicInst &II) {
1675 if (auto II_U =
1676 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1677 return II_U;
1678 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1679 Intrinsic::aarch64_sve_mla>(
1680 IC, II, true))
1681 return MLA;
1682 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1683 Intrinsic::aarch64_sve_mad>(
1684 IC, II, false))
1685 return MAD;
1686 return std::nullopt;
1687}
1688
1689static std::optional<Instruction *>
1690 instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1691 if (auto II_U =
1692 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1693 return II_U;
1694 if (auto FMLA =
1695 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1696 Intrinsic::aarch64_sve_fmla>(IC, II,
1697 true))
1698 return FMLA;
1699 if (auto FMAD =
1700 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1701 Intrinsic::aarch64_sve_fmad>(IC, II,
1702 false))
1703 return FMAD;
1704 if (auto FMLA =
1705 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1706 Intrinsic::aarch64_sve_fmla>(IC, II,
1707 true))
1708 return FMLA;
1709 return std::nullopt;
1710}
1711
1712static std::optional<Instruction *>
1713 instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1714 if (auto FMLA =
1715 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1716 Intrinsic::aarch64_sve_fmla>(IC, II,
1717 true))
1718 return FMLA;
1719 if (auto FMAD =
1720 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1721 Intrinsic::aarch64_sve_fmad>(IC, II,
1722 false))
1723 return FMAD;
1724 if (auto FMLA_U =
1725 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1726 Intrinsic::aarch64_sve_fmla_u>(
1727 IC, II, true))
1728 return FMLA_U;
1729 return instCombineSVEVectorBinOp(IC, II);
1730}
1731
1732static std::optional<Instruction *>
1733 instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1734 if (auto II_U =
1735 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1736 return II_U;
1737 if (auto FMLS =
1738 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1739 Intrinsic::aarch64_sve_fmls>(IC, II,
1740 true))
1741 return FMLS;
1742 if (auto FMSB =
1743 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1744 Intrinsic::aarch64_sve_fnmsb>(
1745 IC, II, false))
1746 return FMSB;
1747 if (auto FMLS =
1748 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1749 Intrinsic::aarch64_sve_fmls>(IC, II,
1750 true))
1751 return FMLS;
1752 return std::nullopt;
1753}
1754
1755static std::optional<Instruction *>
1756 instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1757 if (auto FMLS =
1758 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1759 Intrinsic::aarch64_sve_fmls>(IC, II,
1760 true))
1761 return FMLS;
1762 if (auto FMSB =
1763 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1764 Intrinsic::aarch64_sve_fnmsb>(
1765 IC, II, false))
1766 return FMSB;
1767 if (auto FMLS_U =
1768 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1769 Intrinsic::aarch64_sve_fmls_u>(
1770 IC, II, true))
1771 return FMLS_U;
1772 return instCombineSVEVectorBinOp(IC, II);
1773}
1774
1775static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1776 IntrinsicInst &II) {
1777 if (auto II_U =
1778 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1779 return II_U;
1780 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1781 Intrinsic::aarch64_sve_mls>(
1782 IC, II, true))
1783 return MLS;
1784 return std::nullopt;
1785}
1786
1787static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1788 IntrinsicInst &II,
1789 Intrinsic::ID IID) {
1790 auto *OpPredicate = II.getOperand(0);
1791 auto *OpMultiplicand = II.getOperand(1);
1792 auto *OpMultiplier = II.getOperand(2);
1793
1794 // Return true if a given instruction is a unit splat value, false otherwise.
1795 auto IsUnitSplat = [](auto *I) {
1796 auto *SplatValue = getSplatValue(I);
1797 if (!SplatValue)
1798 return false;
1799 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1800 };
1801
1802 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1803 // with a unit splat value, false otherwise.
1804 auto IsUnitDup = [](auto *I) {
1805 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1806 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1807 return false;
1808
1809 auto *SplatValue = IntrI->getOperand(2);
1810 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1811 };
1812
1813 if (IsUnitSplat(OpMultiplier)) {
1814 // [f]mul pg %n, (dupx 1) => %n
1815 OpMultiplicand->takeName(&II);
1816 return IC.replaceInstUsesWith(II, OpMultiplicand);
1817 } else if (IsUnitDup(OpMultiplier)) {
1818 // [f]mul pg %n, (dup pg 1) => %n
1819 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1820 auto *DupPg = DupInst->getOperand(1);
1821 // TODO: this is naive. The optimization is still valid if DupPg
1822 // 'encompasses' OpPredicate, not only if they're the same predicate.
1823 if (OpPredicate == DupPg) {
1824 OpMultiplicand->takeName(&II);
1825 return IC.replaceInstUsesWith(II, OpMultiplicand);
1826 }
1827 }
1828
1829 return instCombineSVEVectorBinOp(IC, II);
1830}
1831
1832static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1833 IntrinsicInst &II) {
1834 Value *UnpackArg = II.getArgOperand(0);
1835 auto *RetTy = cast<ScalableVectorType>(II.getType());
1836 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1837 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1838
1839 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1840 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1841 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1842 ScalarArg =
1843 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1844 Value *NewVal =
1845 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1846 NewVal->takeName(&II);
1847 return IC.replaceInstUsesWith(II, NewVal);
1848 }
1849
1850 return std::nullopt;
1851}
1852static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1853 IntrinsicInst &II) {
1854 auto *OpVal = II.getOperand(0);
1855 auto *OpIndices = II.getOperand(1);
1856 VectorType *VTy = cast<VectorType>(II.getType());
1857
1858 // Check whether OpIndices is a constant splat value < minimal element count
1859 // of result.
1860 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1861 if (!SplatValue ||
1862 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1863 return std::nullopt;
1864
1865 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
1866 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1867 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1868 auto *VectorSplat =
1869 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1870
1871 VectorSplat->takeName(&II);
1872 return IC.replaceInstUsesWith(II, VectorSplat);
1873}
1874
1875static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1876 IntrinsicInst &II) {
1877 Value *A, *B;
1878 Type *RetTy = II.getType();
1879 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1880 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1881
1882 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1883 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1884 if ((match(II.getArgOperand(0),
1885 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1886 match(II.getArgOperand(1),
1887 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1888 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1889 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1890 auto *TyA = cast<ScalableVectorType>(A->getType());
1891 if (TyA == B->getType() &&
1892 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1893 auto *SubVec = IC.Builder.CreateInsertVector(
1894 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1895 auto *ConcatVec = IC.Builder.CreateInsertVector(
1896 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1897 ConcatVec->takeName(&II);
1898 return IC.replaceInstUsesWith(II, ConcatVec);
1899 }
1900 }
1901
1902 return std::nullopt;
1903}
1904
1905static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1906 IntrinsicInst &II) {
1907 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1908 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1909 Value *A, *B;
1910 if (match(II.getArgOperand(0),
1911 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1912 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1913 m_Specific(A), m_Specific(B))))
1914 return IC.replaceInstUsesWith(
1915 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1916
1917 return std::nullopt;
1918}
1919
1920static std::optional<Instruction *>
1921 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1922 Value *Mask = II.getOperand(0);
1923 Value *BasePtr = II.getOperand(1);
1924 Value *Index = II.getOperand(2);
1925 Type *Ty = II.getType();
1926 Value *PassThru = ConstantAggregateZero::get(Ty);
1927
1928 // Replace with a zero constant when all lanes are inactive.
1929 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1930 return II_NA;
1931
1932 // Contiguous gather => masked load.
1933 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1934 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1935 Value *IndexBase;
1936 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1937 m_Value(IndexBase), m_SpecificInt(1)))) {
1938 Align Alignment =
1939 BasePtr->getPointerAlignment(II.getDataLayout());
1940
1941 Type *VecPtrTy = PointerType::getUnqual(Ty);
1942 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1943 BasePtr, IndexBase);
1944 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1945 CallInst *MaskedLoad =
1946 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1947 MaskedLoad->takeName(&II);
1948 return IC.replaceInstUsesWith(II, MaskedLoad);
1949 }
1950
1951 return std::nullopt;
1952}
1953
1954static std::optional<Instruction *>
1955 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1956 Value *Val = II.getOperand(0);
1957 Value *Mask = II.getOperand(1);
1958 Value *BasePtr = II.getOperand(2);
1959 Value *Index = II.getOperand(3);
1960 Type *Ty = Val->getType();
1961
1962 // Contiguous scatter => masked store.
1963 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1964 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1965 Value *IndexBase;
1966 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1967 m_Value(IndexBase), m_SpecificInt(1)))) {
1968 Align Alignment =
1969 BasePtr->getPointerAlignment(II.getDataLayout());
1970
1971 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1972 BasePtr, IndexBase);
1973 Type *VecPtrTy = PointerType::getUnqual(Ty);
1974 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1975
1976 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1977
1978 return IC.eraseInstFromFunction(II);
1979 }
1980
1981 return std::nullopt;
1982}
1983
1984static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1985 IntrinsicInst &II) {
1986 Type *Int32Ty = IC.Builder.getInt32Ty();
1987 Value *Pred = II.getOperand(0);
1988 Value *Vec = II.getOperand(1);
1989 Value *DivVec = II.getOperand(2);
1990
1991 Value *SplatValue = getSplatValue(DivVec);
1992 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1993 if (!SplatConstantInt)
1994 return std::nullopt;
1995
1996 APInt Divisor = SplatConstantInt->getValue();
1997 const int64_t DivisorValue = Divisor.getSExtValue();
1998 if (DivisorValue == -1)
1999 return std::nullopt;
2000 if (DivisorValue == 1)
2001 IC.replaceInstUsesWith(II, Vec);
2002
2003 if (Divisor.isPowerOf2()) {
2004 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2005 auto ASRD = IC.Builder.CreateIntrinsic(
2006 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2007 return IC.replaceInstUsesWith(II, ASRD);
2008 }
2009 if (Divisor.isNegatedPowerOf2()) {
2010 Divisor.negate();
2011 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2012 auto ASRD = IC.Builder.CreateIntrinsic(
2013 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2014 auto NEG = IC.Builder.CreateIntrinsic(
2015 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2016 return IC.replaceInstUsesWith(II, NEG);
2017 }
2018
2019 return std::nullopt;
2020}
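// Illustrative sketch: sve.sdiv(%pg, %x, splat(8)) becomes
// sve.asrd(%pg, %x, 3), and a splat of -8 becomes the same sve.asrd followed
// by a predicated sve.neg of the result.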
2021
2022bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2023 size_t VecSize = Vec.size();
2024 if (VecSize == 1)
2025 return true;
2026 if (!isPowerOf2_64(VecSize))
2027 return false;
2028 size_t HalfVecSize = VecSize / 2;
2029
2030 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2031 RHS != Vec.end(); LHS++, RHS++) {
2032 if (*LHS != nullptr && *RHS != nullptr) {
2033 if (*LHS == *RHS)
2034 continue;
2035 else
2036 return false;
2037 }
2038 if (!AllowPoison)
2039 return false;
2040 if (*LHS == nullptr && *RHS != nullptr)
2041 *LHS = *RHS;
2042 }
2043
2044 Vec.resize(HalfVecSize);
2045 SimplifyValuePattern(Vec, AllowPoison);
2046 return true;
2047}
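// Illustrative sketch: the element pattern {A, B, A, B} halves recursively to
// {A, B}; with AllowPoison set, {A, B, A, nullptr} (a poison lane, tracked as
// nullptr) is also accepted and simplifies to {A, B}.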
2048
2049// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2050// to dupqlane(f64(C)) where C is A concatenated with B
2051static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2052 IntrinsicInst &II) {
2053 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2054 if (!match(II.getOperand(0),
2055 m_Intrinsic<Intrinsic::vector_insert>(
2056 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2057 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2058 return std::nullopt;
2059 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2060
2061 // Insert the scalars into a container ordered by InsertElement index
2062 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2063 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2064 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2065 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2066 CurrentInsertElt = InsertElt->getOperand(0);
2067 }
2068
2069 bool AllowPoison =
2070 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2071 if (!SimplifyValuePattern(Elts, AllowPoison))
2072 return std::nullopt;
2073
2074 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2075 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2076 for (size_t I = 0; I < Elts.size(); I++) {
2077 if (Elts[I] == nullptr)
2078 continue;
2079 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2080 IC.Builder.getInt64(I));
2081 }
2082 if (InsertEltChain == nullptr)
2083 return std::nullopt;
2084
2085 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2086 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2087 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2088 // be narrowed back to the original type.
2089 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2090 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2091 IIScalableTy->getMinNumElements() /
2092 PatternWidth;
2093
2094 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2095 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2096 auto *WideShuffleMaskTy =
2097 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2098
2099 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
2100 auto InsertSubvector = IC.Builder.CreateInsertVector(
2101 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
2102 auto WideBitcast =
2103 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2104 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2105 auto WideShuffle = IC.Builder.CreateShuffleVector(
2106 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2107 auto NarrowBitcast =
2108 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2109
2110 return IC.replaceInstUsesWith(II, NarrowBitcast);
2111}
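// Illustrative sketch: for dupq_lane over inserts of (f32 A, f32 B, f32 A,
// f32 B), the pattern simplifies to (A, B), which is bitcast to an i64-element
// vector, splatted via an all-zero shuffle mask, and bitcast back to the
// original type.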
2112
2113static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2114 IntrinsicInst &II) {
2115 Value *A = II.getArgOperand(0);
2116 Value *B = II.getArgOperand(1);
2117 if (A == B)
2118 return IC.replaceInstUsesWith(II, A);
2119
2120 return std::nullopt;
2121}
2122
2123static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2124 IntrinsicInst &II) {
2125 Value *Pred = II.getOperand(0);
2126 Value *Vec = II.getOperand(1);
2127 Value *Shift = II.getOperand(2);
2128
2129 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2130 Value *AbsPred, *MergedValue;
2131 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2132 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2133 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2134 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2135
2136 return std::nullopt;
2137
2138 // Transform is valid if any of the following are true:
2139 // * The ABS merge value is an undef or non-negative
2140 // * The ABS predicate is all active
2141 // * The ABS predicate and the SRSHL predicates are the same
2142 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2143 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2144 return std::nullopt;
2145
2146 // Only valid when the shift amount is non-negative, otherwise the rounding
2147 // behaviour of SRSHL cannot be ignored.
2148 if (!match(Shift, m_NonNegative()))
2149 return std::nullopt;
2150
2151 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2152 {II.getType()}, {Pred, Vec, Shift});
2153
2154 return IC.replaceInstUsesWith(II, LSL);
2155}
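// Illustrative sketch: sve.srshl(%pg, sve.abs(poison, %pg, %x), splat(2))
// becomes sve.lsl(%pg, <the abs>, splat(2)), since the shifted value is known
// non-negative and the shift amount is non-negative.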
2156
2157static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2158 IntrinsicInst &II) {
2159 Value *Vec = II.getOperand(0);
2160
2161 if (getSplatValue(Vec) == II.getOperand(1))
2162 return IC.replaceInstUsesWith(II, Vec);
2163
2164 return std::nullopt;
2165}
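// Illustrative sketch: sve.insr(splat(%x), %x) just reproduces the same splat,
// so the call folds to its vector operand.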
2166
2167static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2168 IntrinsicInst &II) {
2169 // If this barrier is post-dominated by an identical one, we can remove it.
2170 auto *NI = II.getNextNonDebugInstruction();
2171 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2172 auto CanSkipOver = [](Instruction *I) {
2173 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2174 };
2175 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2176 auto *NIBB = NI->getParent();
2177 NI = NI->getNextNonDebugInstruction();
2178 if (!NI) {
2179 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2180 NI = SuccBB->getFirstNonPHIOrDbgOrLifetime();
2181 else
2182 break;
2183 }
2184 }
2185 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2186 if (NextII && II.isIdenticalTo(NextII))
2187 return IC.eraseInstFromFunction(II);
2188
2189 return std::nullopt;
2190}
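// Illustrative sketch: two calls to llvm.aarch64.dmb with the same argument,
// separated only by instructions that neither access memory nor have side
// effects, collapse to a single barrier (the earlier one is erased).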
2191
2192std::optional<Instruction *>
2193 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2194 IntrinsicInst &II) const {
2195 Intrinsic::ID IID = II.getIntrinsicID();
2196 switch (IID) {
2197 default:
2198 break;
2199 case Intrinsic::aarch64_dmb:
2200 return instCombineDMB(IC, II);
2201 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
2202 case Intrinsic::aarch64_sve_fcvt_f16f32:
2203 case Intrinsic::aarch64_sve_fcvt_f16f64:
2204 case Intrinsic::aarch64_sve_fcvt_f32f16:
2205 case Intrinsic::aarch64_sve_fcvt_f32f64:
2206 case Intrinsic::aarch64_sve_fcvt_f64f16:
2207 case Intrinsic::aarch64_sve_fcvt_f64f32:
2208 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
2209 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
2210 case Intrinsic::aarch64_sve_fcvtx_f32f64:
2211 case Intrinsic::aarch64_sve_fcvtzs:
2212 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
2213 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
2214 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
2215 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
2216 case Intrinsic::aarch64_sve_fcvtzu:
2217 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
2218 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
2219 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
2220 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
2221 case Intrinsic::aarch64_sve_scvtf:
2222 case Intrinsic::aarch64_sve_scvtf_f16i32:
2223 case Intrinsic::aarch64_sve_scvtf_f16i64:
2224 case Intrinsic::aarch64_sve_scvtf_f32i64:
2225 case Intrinsic::aarch64_sve_scvtf_f64i32:
2226 case Intrinsic::aarch64_sve_ucvtf:
2227 case Intrinsic::aarch64_sve_ucvtf_f16i32:
2228 case Intrinsic::aarch64_sve_ucvtf_f16i64:
2229 case Intrinsic::aarch64_sve_ucvtf_f32i64:
2230 case Intrinsic::aarch64_sve_ucvtf_f64i32:
2231 return instCombineSVENoActiveReplace(IC, II, false);
2232 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
2233 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
2234 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
2235 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
2236 return instCombineSVENoActiveReplace(IC, II, true);
2237 case Intrinsic::aarch64_sve_st1_scatter:
2238 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2239 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2240 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2241 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2242 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2243 case Intrinsic::aarch64_sve_st1dq:
2244 case Intrinsic::aarch64_sve_st1q_scatter_index:
2245 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2246 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2247 case Intrinsic::aarch64_sve_st1wq:
2248 case Intrinsic::aarch64_sve_stnt1:
2249 case Intrinsic::aarch64_sve_stnt1_scatter:
2250 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2251 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2252 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2253 return instCombineSVENoActiveUnaryErase(IC, II, 1);
2254 case Intrinsic::aarch64_sve_st2:
2255 case Intrinsic::aarch64_sve_st2q:
2256 return instCombineSVENoActiveUnaryErase(IC, II, 2);
2257 case Intrinsic::aarch64_sve_st3:
2258 case Intrinsic::aarch64_sve_st3q:
2259 return instCombineSVENoActiveUnaryErase(IC, II, 3);
2260 case Intrinsic::aarch64_sve_st4:
2261 case Intrinsic::aarch64_sve_st4q:
2262 return instCombineSVENoActiveUnaryErase(IC, II, 4);
2263 case Intrinsic::aarch64_sve_addqv:
2264 case Intrinsic::aarch64_sve_and_z:
2265 case Intrinsic::aarch64_sve_bic_z:
2266 case Intrinsic::aarch64_sve_brka_z:
2267 case Intrinsic::aarch64_sve_brkb_z:
2268 case Intrinsic::aarch64_sve_brkn_z:
2269 case Intrinsic::aarch64_sve_brkpa_z:
2270 case Intrinsic::aarch64_sve_brkpb_z:
2271 case Intrinsic::aarch64_sve_cntp:
2272 case Intrinsic::aarch64_sve_compact:
2273 case Intrinsic::aarch64_sve_eor_z:
2274 case Intrinsic::aarch64_sve_eorv:
2275 case Intrinsic::aarch64_sve_eorqv:
2276 case Intrinsic::aarch64_sve_nand_z:
2277 case Intrinsic::aarch64_sve_nor_z:
2278 case Intrinsic::aarch64_sve_orn_z:
2279 case Intrinsic::aarch64_sve_orr_z:
2280 case Intrinsic::aarch64_sve_orv:
2281 case Intrinsic::aarch64_sve_orqv:
2282 case Intrinsic::aarch64_sve_pnext:
2283 case Intrinsic::aarch64_sve_rdffr_z:
2284 case Intrinsic::aarch64_sve_saddv:
2285 case Intrinsic::aarch64_sve_uaddv:
2286 case Intrinsic::aarch64_sve_umaxv:
2287 case Intrinsic::aarch64_sve_umaxqv:
2288 case Intrinsic::aarch64_sve_cmpeq:
2289 case Intrinsic::aarch64_sve_cmpeq_wide:
2290 case Intrinsic::aarch64_sve_cmpge:
2291 case Intrinsic::aarch64_sve_cmpge_wide:
2292 case Intrinsic::aarch64_sve_cmpgt:
2293 case Intrinsic::aarch64_sve_cmpgt_wide:
2294 case Intrinsic::aarch64_sve_cmphi:
2295 case Intrinsic::aarch64_sve_cmphi_wide:
2296 case Intrinsic::aarch64_sve_cmphs:
2297 case Intrinsic::aarch64_sve_cmphs_wide:
2298 case Intrinsic::aarch64_sve_cmple_wide:
2299 case Intrinsic::aarch64_sve_cmplo_wide:
2300 case Intrinsic::aarch64_sve_cmpls_wide:
2301 case Intrinsic::aarch64_sve_cmplt_wide:
2302 case Intrinsic::aarch64_sve_facge:
2303 case Intrinsic::aarch64_sve_facgt:
2304 case Intrinsic::aarch64_sve_fcmpeq:
2305 case Intrinsic::aarch64_sve_fcmpge:
2306 case Intrinsic::aarch64_sve_fcmpgt:
2307 case Intrinsic::aarch64_sve_fcmpne:
2308 case Intrinsic::aarch64_sve_fcmpuo:
2309 case Intrinsic::aarch64_sve_ld1_gather:
2310 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2311 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2312 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2313 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2314 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2315 case Intrinsic::aarch64_sve_ld1q_gather_index:
2316 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2317 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2318 case Intrinsic::aarch64_sve_ld1ro:
2319 case Intrinsic::aarch64_sve_ld1rq:
2320 case Intrinsic::aarch64_sve_ld1udq:
2321 case Intrinsic::aarch64_sve_ld1uwq:
2322 case Intrinsic::aarch64_sve_ld2_sret:
2323 case Intrinsic::aarch64_sve_ld2q_sret:
2324 case Intrinsic::aarch64_sve_ld3_sret:
2325 case Intrinsic::aarch64_sve_ld3q_sret:
2326 case Intrinsic::aarch64_sve_ld4_sret:
2327 case Intrinsic::aarch64_sve_ld4q_sret:
2328 case Intrinsic::aarch64_sve_ldff1:
2329 case Intrinsic::aarch64_sve_ldff1_gather:
2330 case Intrinsic::aarch64_sve_ldff1_gather_index:
2331 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2332 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2333 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2334 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2335 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2336 case Intrinsic::aarch64_sve_ldnf1:
2337 case Intrinsic::aarch64_sve_ldnt1:
2338 case Intrinsic::aarch64_sve_ldnt1_gather:
2339 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2340 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2341 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2342 return instCombineSVENoActiveZero(IC, II);
2343 case Intrinsic::aarch64_sve_prf:
2344 case Intrinsic::aarch64_sve_prfb_gather_index:
2345 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
2346 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
2347 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
2348 case Intrinsic::aarch64_sve_prfd_gather_index:
2349 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
2350 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
2351 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
2352 case Intrinsic::aarch64_sve_prfh_gather_index:
2353 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
2354 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
2355 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
2356 case Intrinsic::aarch64_sve_prfw_gather_index:
2357 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
2358 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
2359 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
2360 return instCombineSVENoActiveUnaryErase(IC, II, 0);
2361 case Intrinsic::aarch64_neon_fmaxnm:
2362 case Intrinsic::aarch64_neon_fminnm:
2363 return instCombineMaxMinNM(IC, II);
2364 case Intrinsic::aarch64_sve_convert_from_svbool:
2365 return instCombineConvertFromSVBool(IC, II);
2366 case Intrinsic::aarch64_sve_dup:
2367 return instCombineSVEDup(IC, II);
2368 case Intrinsic::aarch64_sve_dup_x:
2369 return instCombineSVEDupX(IC, II);
2370 case Intrinsic::aarch64_sve_cmpne:
2371 case Intrinsic::aarch64_sve_cmpne_wide:
2372 return instCombineSVECmpNE(IC, II);
2373 case Intrinsic::aarch64_sve_rdffr:
2374 return instCombineRDFFR(IC, II);
2375 case Intrinsic::aarch64_sve_lasta:
2376 case Intrinsic::aarch64_sve_lastb:
2377 return instCombineSVELast(IC, II);
2378 case Intrinsic::aarch64_sve_clasta_n:
2379 case Intrinsic::aarch64_sve_clastb_n:
2380 return instCombineSVECondLast(IC, II);
2381 case Intrinsic::aarch64_sve_cntd:
2382 return instCombineSVECntElts(IC, II, 2);
2383 case Intrinsic::aarch64_sve_cntw:
2384 return instCombineSVECntElts(IC, II, 4);
2385 case Intrinsic::aarch64_sve_cnth:
2386 return instCombineSVECntElts(IC, II, 8);
2387 case Intrinsic::aarch64_sve_cntb:
2388 return instCombineSVECntElts(IC, II, 16);
2389 case Intrinsic::aarch64_sve_ptest_any:
2390 case Intrinsic::aarch64_sve_ptest_first:
2391 case Intrinsic::aarch64_sve_ptest_last:
2392 return instCombineSVEPTest(IC, II);
2393 case Intrinsic::aarch64_sve_fabd:
2394 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2395 case Intrinsic::aarch64_sve_fadd:
2396 return instCombineSVEVectorFAdd(IC, II);
2397 case Intrinsic::aarch64_sve_fadd_u:
2398 return instCombineSVEVectorFAddU(IC, II);
2399 case Intrinsic::aarch64_sve_fdiv:
2400 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2401 case Intrinsic::aarch64_sve_fmax:
2402 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2403 case Intrinsic::aarch64_sve_fmaxnm:
2404 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2405 case Intrinsic::aarch64_sve_fmin:
2406 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2407 case Intrinsic::aarch64_sve_fminnm:
2408 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2409 case Intrinsic::aarch64_sve_fmla:
2410 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2411 case Intrinsic::aarch64_sve_fmls:
2412 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2413 case Intrinsic::aarch64_sve_fmul:
2414 if (auto II_U =
2415 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2416 return II_U;
2417 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2418 case Intrinsic::aarch64_sve_fmul_u:
2419 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2420 case Intrinsic::aarch64_sve_fmulx:
2421 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2422 case Intrinsic::aarch64_sve_fnmla:
2423 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2424 case Intrinsic::aarch64_sve_fnmls:
2425 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2426 case Intrinsic::aarch64_sve_fsub:
2427 return instCombineSVEVectorFSub(IC, II);
2428 case Intrinsic::aarch64_sve_fsub_u:
2429 return instCombineSVEVectorFSubU(IC, II);
2430 case Intrinsic::aarch64_sve_add:
2431 return instCombineSVEVectorAdd(IC, II);
2432 case Intrinsic::aarch64_sve_add_u:
2433 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2434 Intrinsic::aarch64_sve_mla_u>(
2435 IC, II, true);
2436 case Intrinsic::aarch64_sve_mla:
2437 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2438 case Intrinsic::aarch64_sve_mls:
2439 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2440 case Intrinsic::aarch64_sve_mul:
2441 if (auto II_U =
2442 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2443 return II_U;
2444 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2445 case Intrinsic::aarch64_sve_mul_u:
2446 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2447 case Intrinsic::aarch64_sve_sabd:
2448 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2449 case Intrinsic::aarch64_sve_smax:
2450 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2451 case Intrinsic::aarch64_sve_smin:
2452 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2453 case Intrinsic::aarch64_sve_smulh:
2454 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2455 case Intrinsic::aarch64_sve_sub:
2456 return instCombineSVEVectorSub(IC, II);
2457 case Intrinsic::aarch64_sve_sub_u:
2458 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2459 Intrinsic::aarch64_sve_mls_u>(
2460 IC, II, true);
2461 case Intrinsic::aarch64_sve_uabd:
2462 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2463 case Intrinsic::aarch64_sve_umax:
2464 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2465 case Intrinsic::aarch64_sve_umin:
2466 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2467 case Intrinsic::aarch64_sve_umulh:
2468 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2469 case Intrinsic::aarch64_sve_asr:
2470 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2471 case Intrinsic::aarch64_sve_lsl:
2472 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2473 case Intrinsic::aarch64_sve_lsr:
2474 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2475 case Intrinsic::aarch64_sve_and:
2476 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2477 case Intrinsic::aarch64_sve_bic:
2478 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2479 case Intrinsic::aarch64_sve_eor:
2480 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2481 case Intrinsic::aarch64_sve_orr:
2482 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2483 case Intrinsic::aarch64_sve_sqsub:
2484 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2485 case Intrinsic::aarch64_sve_uqsub:
2486 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2487 case Intrinsic::aarch64_sve_tbl:
2488 return instCombineSVETBL(IC, II);
2489 case Intrinsic::aarch64_sve_uunpkhi:
2490 case Intrinsic::aarch64_sve_uunpklo:
2491 case Intrinsic::aarch64_sve_sunpkhi:
2492 case Intrinsic::aarch64_sve_sunpklo:
2493 return instCombineSVEUnpack(IC, II);
2494 case Intrinsic::aarch64_sve_uzp1:
2495 return instCombineSVEUzp1(IC, II);
2496 case Intrinsic::aarch64_sve_zip1:
2497 case Intrinsic::aarch64_sve_zip2:
2498 return instCombineSVEZip(IC, II);
2499 case Intrinsic::aarch64_sve_ld1_gather_index:
2500 return instCombineLD1GatherIndex(IC, II);
2501 case Intrinsic::aarch64_sve_st1_scatter_index:
2502 return instCombineST1ScatterIndex(IC, II);
2503 case Intrinsic::aarch64_sve_ld1:
2504 return instCombineSVELD1(IC, II, DL);
2505 case Intrinsic::aarch64_sve_st1:
2506 return instCombineSVEST1(IC, II, DL);
2507 case Intrinsic::aarch64_sve_sdiv:
2508 return instCombineSVESDIV(IC, II);
2509 case Intrinsic::aarch64_sve_sel:
2510 return instCombineSVESel(IC, II);
2511 case Intrinsic::aarch64_sve_srshl:
2512 return instCombineSVESrshl(IC, II);
2513 case Intrinsic::aarch64_sve_dupq_lane:
2514 return instCombineSVEDupqLane(IC, II);
2515 case Intrinsic::aarch64_sve_insr:
2516 return instCombineSVEInsr(IC, II);
2517 }
2518
2519 return std::nullopt;
2520}
2521
2522 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2523 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2524 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2525 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2526 SimplifyAndSetOp) const {
2527 switch (II.getIntrinsicID()) {
2528 default:
2529 break;
2530 case Intrinsic::aarch64_neon_fcvtxn:
2531 case Intrinsic::aarch64_neon_rshrn:
2532 case Intrinsic::aarch64_neon_sqrshrn:
2533 case Intrinsic::aarch64_neon_sqrshrun:
2534 case Intrinsic::aarch64_neon_sqshrn:
2535 case Intrinsic::aarch64_neon_sqshrun:
2536 case Intrinsic::aarch64_neon_sqxtn:
2537 case Intrinsic::aarch64_neon_sqxtun:
2538 case Intrinsic::aarch64_neon_uqrshrn:
2539 case Intrinsic::aarch64_neon_uqshrn:
2540 case Intrinsic::aarch64_neon_uqxtn:
2541 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2542 break;
2543 }
2544
2545 return std::nullopt;
2546}
2547
2548 bool AArch64TTIImpl::enableScalableVectorization() const {
2549 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2550 EnableScalableAutovecInStreamingMode);
2551 }
2552
2553 TypeSize
2554 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2555 switch (K) {
2556 case TargetTransformInfo::RGK_Scalar:
2557 return TypeSize::getFixed(64);
2558 case TargetTransformInfo::RGK_FixedWidthVector:
2559 if (ST->useSVEForFixedLengthVectors() &&
2560 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2561 return TypeSize::getFixed(
2562 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2563 else if (ST->isNeonAvailable())
2564 return TypeSize::getFixed(128);
2565 else
2566 return TypeSize::getFixed(0);
2567 case TargetTransformInfo::RGK_ScalableVector:
2568 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2569 EnableScalableAutovecInStreamingMode))
2570 return TypeSize::getScalable(128);
2571 else
2572 return TypeSize::getScalable(0);
2573 }
2574 llvm_unreachable("Unsupported register kind");
2575}
2576
2577bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2578 ArrayRef<const Value *> Args,
2579 Type *SrcOverrideTy) {
2580 // A helper that returns a vector type from the given type. The number of
2581 // elements in type Ty determines the vector width.
2582 auto toVectorTy = [&](Type *ArgTy) {
2583 return VectorType::get(ArgTy->getScalarType(),
2584 cast<VectorType>(DstTy)->getElementCount());
2585 };
2586
2587 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2588 // i32, i64]. SVE doesn't generally have the same set of instructions to
2589 // perform an extend with the add/sub/mul. There are SMULLB style
2590 // instructions, but they operate on top/bottom, requiring some sort of lane
2591 // interleaving to be used with zext/sext.
2592 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2593 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2594 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2595 return false;
2596
2597 // Determine if the operation has a widening variant. We consider both the
2598 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2599 // instructions.
2600 //
2601 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2602 // verify that their extending operands are eliminated during code
2603 // generation.
2604 Type *SrcTy = SrcOverrideTy;
2605 switch (Opcode) {
2606 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2607 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2608 // The second operand needs to be an extend
2609 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2610 if (!SrcTy)
2611 SrcTy =
2612 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2613 } else
2614 return false;
2615 break;
2616 case Instruction::Mul: { // SMULL(2), UMULL(2)
2617 // Both operands need to be extends of the same type.
2618 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2619 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2620 if (!SrcTy)
2621 SrcTy =
2622 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2623 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2624 // If one of the operands is a Zext and the other has enough zero bits to
2625 // be treated as unsigned, we can still generate a umull, meaning the zext
2626 // is free.
2627 KnownBits Known =
2628 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2629 if (Args[0]->getType()->getScalarSizeInBits() -
2630 Known.Zero.countLeadingOnes() >
2631 DstTy->getScalarSizeInBits() / 2)
2632 return false;
2633 if (!SrcTy)
2634 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2635 DstTy->getScalarSizeInBits() / 2));
2636 } else
2637 return false;
2638 break;
2639 }
2640 default:
2641 return false;
2642 }
2643
2644 // Legalize the destination type and ensure it can be used in a widening
2645 // operation.
2646 auto DstTyL = getTypeLegalizationCost(DstTy);
2647 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2648 return false;
2649
2650 // Legalize the source type and ensure it can be used in a widening
2651 // operation.
2652 assert(SrcTy && "Expected some SrcTy");
2653 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2654 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2655 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2656 return false;
2657
2658 // Get the total number of vector elements in the legalized types.
2659 InstructionCost NumDstEls =
2660 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2661 InstructionCost NumSrcEls =
2662 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2663
2664 // Return true if the legalized types have the same number of vector elements
2665 // and the destination element type size is twice that of the source type.
2666 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2667}
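// Illustrative sketch: for add <8 x i16> (zext <8 x i8> %a to <8 x i16>),
// (zext <8 x i8> %b to <8 x i16>), the legalized source and destination have
// the same element count and the destination elements are twice as wide, so
// the operation can map onto a widening instruction (e.g. uaddl) and the
// extend of the second operand may be treated as free by the cost model below.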
2668
2669// s/urhadd instructions implement the following pattern, making the
2670// extends free:
2671// %x = add ((zext i8 -> i16), 1)
2672// %y = (zext i8 -> i16)
2673// trunc i16 (lshr (add %x, %y), 1) -> i8
2674//
2675 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2676 Type *Src) {
2677 // The source should be a legal vector type.
2678 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2679 (Src->isScalableTy() && !ST->hasSVE2()))
2680 return false;
2681
2682 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2683 return false;
2684
2685 // Look for trunc/shl/add before trying to match the pattern.
2686 const Instruction *Add = ExtUser;
2687 auto *AddUser =
2688 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2689 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2690 Add = AddUser;
2691
2692 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2693 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2694 return false;
2695
2696 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2697 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2698 Src->getScalarSizeInBits() !=
2699 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2700 return false;
2701
2702 // Try to match the whole pattern. Ext could be either the first or second
2703 // m_ZExtOrSExt matched.
2704 Instruction *Ex1, *Ex2;
2705 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2706 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2707 return false;
2708
2709 // Ensure both extends are of the same type
2710 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2711 Ex1->getOpcode() == Ex2->getOpcode())
2712 return true;
2713
2714 return false;
2715}
2716
2717 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2718 Type *Src,
2719 TTI::CastContextHint CCH,
2720 TTI::TargetCostKind CostKind,
2721 const Instruction *I) {
2722 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2723 assert(ISD && "Invalid opcode");
2724 // If the cast is observable, and it is used by a widening instruction (e.g.,
2725 // uaddl, saddw, etc.), it may be free.
2726 if (I && I->hasOneUser()) {
2727 auto *SingleUser = cast<Instruction>(*I->user_begin());
2728 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2729 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2730 // For adds, only count the second operand as free if both operands are
2731 // extends but not the same operation (i.e. both operands are not free in
2732 // add(sext, zext)).
2733 if (SingleUser->getOpcode() == Instruction::Add) {
2734 if (I == SingleUser->getOperand(1) ||
2735 (isa<CastInst>(SingleUser->getOperand(1)) &&
2736 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2737 return 0;
2738 } else // Others are free so long as isWideningInstruction returned true.
2739 return 0;
2740 }
2741
2742 // The cast will be free for the s/urhadd instructions
2743 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2744 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2745 return 0;
2746 }
2747
2748 // TODO: Allow non-throughput costs that aren't binary.
2749 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2750 if (CostKind != TTI::TCK_RecipThroughput)
2751 return Cost == 0 ? 0 : 1;
2752 return Cost;
2753 };
2754
2755 EVT SrcTy = TLI->getValueType(DL, Src);
2756 EVT DstTy = TLI->getValueType(DL, Dst);
2757
2758 if (!SrcTy.isSimple() || !DstTy.isSimple())
2759 return AdjustCost(
2760 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2761
2762 static const TypeConversionCostTblEntry ConversionTbl[] = {
2763 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2764 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2765 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2766 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2767 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2768 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2769 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2770 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2771 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2772 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2773 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2774 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2775 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2776 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2777 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2778 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2779 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2780 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2781 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2782 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2783
2784 // Truncations on nxvmiN
2785 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
2786 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
2787 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
2788 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
2789 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
2790 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
2791 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
2792 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
2793 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
2794 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
2795 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
2796 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
2797 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
2798 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
2799 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
2800 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
2801 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
2802 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
2803 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
2804 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
2805 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
2806 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
2807 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
2808 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
2809 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
2810 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
2811 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
2812 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
2813 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
2814 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
2815 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
2816 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
2817 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
2818
2819 // The number of shll instructions for the extension.
2820 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2821 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2822 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2823 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2824 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2825 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2826 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2827 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2828 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2829 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2830 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2831 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2832 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2833 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2834 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2835 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2836
2837 // FP Ext and trunc
2838 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
2839 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
2840 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
2841 // FP16
2842 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
2843 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
2844 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
2845 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
2846 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
2847 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
2848 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
2849 // FP Ext and trunc
2850 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
2851 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
2852 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
2853 // FP16
2854 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
2855 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
2856 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
2857 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
2858 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
2859 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
2860 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
2861
2862 // LowerVectorINT_TO_FP:
2863 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2864 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2865 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2866 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2867 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2868 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2869
2870 // Complex: to v2f32
2871 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2872 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2873 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2874 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2875 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2876 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2877
2878 // Complex: to v4f32
2879 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
2880 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2881 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
2882 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2883
2884 // Complex: to v8f32
2885 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2886 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2887 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2888 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2889
2890 // Complex: to v16f32
2891 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2892 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2893
2894 // Complex: to v2f64
2895 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2896 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2897 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2898 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2899 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2900 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2901
2902 // Complex: to v4f64
2903 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2904 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2905
2906 // LowerVectorFP_TO_INT
2907 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
2908 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
2909 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
2910 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
2911 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
2912 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
2913
2914 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2915 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
2916 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
2917 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
2918 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
2919 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
2920 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
2921
2922 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2923 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
2924 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
2925 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
2926 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
2927
2928 // Complex, from nxv2f32.
2929 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2930 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2931 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2932 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2933 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2934 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2935 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2936 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2937
2938 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2939 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
2940 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
2941 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
2942 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
2943 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
2944 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
2945
2946 // Complex, from nxv2f64.
2947 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2948 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2949 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2950 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2951 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2952 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2953 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2954 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2955
2956 // Complex, from nxv4f32.
2957 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
2958 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
2959 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
2960 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
2961 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
2962 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
2963 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
2964 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
2965
2966 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2967 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
2968 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
2969 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
2970 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
2971
2972 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2973 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
2974 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
2975 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
2976 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
2977 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
2978 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
2979
2980 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2981 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
2982 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
2983 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
2984 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
2985
2986 // Complex, from nxv8f16.
2987 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
2988 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
2989 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
2990 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
2991 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
2992 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
2993 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
2994 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
2995
2996 // Complex, from nxv4f16.
2997 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
2998 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
2999 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3000 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3001 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3002 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3003 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3004 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3005
3006 // Complex, from nxv2f16.
3007 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3008 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3009 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3010 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3011 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3012 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3013 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3014 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3015
3016 // Truncate from nxvmf32 to nxvmf16.
3017 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3018 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3019 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3020
3021 // Truncate from nxvmf64 to nxvmf16.
3022 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3023 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3024 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3025
3026 // Truncate from nxvmf64 to nxvmf32.
3027 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3028 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3029 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3030
3031 // Extend from nxvmf16 to nxvmf32.
3032 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3033 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3034 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3035
3036 // Extend from nxvmf16 to nxvmf64.
3037 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3038 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3039 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3040
3041 // Extend from nxvmf32 to nxvmf64.
3042 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3043 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3044 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3045
3046 // Bitcasts from float to integer
3047 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3048 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3049 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3050
3051 // Bitcasts from integer to float
3052 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3053 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3054 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3055
3056 // Add cost for extending to illegal (too wide) scalable vectors.
3057 // Zero/sign extends are implemented by multiple unpack operations,
3058 // where each operation has a cost of 1.
3059 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3060 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3061 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3062 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3063 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3064 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3065
3066 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3067 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3068 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3069 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3070 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3071 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3072 };
3073
3074 // For a fixed-length operation that is performed on SVE registers, we have
3075 // to estimate the cost based on the number of SVE registers required to
3076 // represent the fixed-length type.
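// Illustrative example: with a 128-bit legal type such as v4i32, NumElements
// below is 128 / 32 == 4, so a fixed-length v4i32 -> v4f32 conversion is
// costed as if it were nxv4i32 -> nxv4f32.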
3077 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3078 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3079 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3080 ST->useSVEForFixedLengthVectors(WiderTy)) {
3081 std::pair<InstructionCost, MVT> LT =
3082 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3083 unsigned NumElements =
3084 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3085 return AdjustCost(
3086 LT.first *
3087 getCastInstrCost(
3088 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3089 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3090 CostKind, I));
3091 }
3092
3093 if (const auto *Entry = ConvertCostTableLookup(
3094 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3095 return AdjustCost(Entry->Cost);
3096
3097 static const TypeConversionCostTblEntry FP16Tbl[] = {
3098 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3099 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3100 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3101 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3102 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3103 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3104 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3105 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3106 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3107 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3108 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3109 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3110 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3111 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3112 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3113 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3114 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3115 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3116 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3117 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3118 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3119 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3120 };
3121
3122 if (ST->hasFullFP16())
3123 if (const auto *Entry = ConvertCostTableLookup(
3124 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3125 return AdjustCost(Entry->Cost);
3126
3127 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3128 CCH == TTI::CastContextHint::Masked &&
3129 ST->isSVEorStreamingSVEAvailable() &&
3130 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3131 TargetLowering::TypePromoteInteger &&
3132 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3133 TargetLowering::TypeSplitVector) {
3134 // The standard behaviour in the backend for these cases is to split the
3135 // extend up into two parts:
3136 // 1. Perform an extending load or masked load up to the legal type.
3137 // 2. Extend the loaded data to the final type.
3138 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3139 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3140 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3141 Opcode, LegalTy, Src, CCH, CostKind, I);
3142 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3143 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3144 return Part1 + Part2;
3145 }
3146
3147 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3148 // but we also want to include the TTI::CastContextHint::Masked case too.
3149 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3150 CCH == TTI::CastContextHint::Masked &&
3151 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3152 CCH = TTI::CastContextHint::Normal;
3153
3154 return AdjustCost(
3155 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3156}
3157
3158 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
3159 Type *Dst,
3160 VectorType *VecTy,
3161 unsigned Index) {
3162
3163 // Make sure we were given a valid extend opcode.
3164 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3165 "Invalid opcode");
3166
3167 // We are extending an element we extract from a vector, so the source type
3168 // of the extend is the element type of the vector.
3169 auto *Src = VecTy->getElementType();
3170
3171 // Sign- and zero-extends are for integer types only.
3172 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3173
3174 // Get the cost for the extract. We compute the cost (if any) for the extend
3175 // below.
3176 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3177 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3178 CostKind, Index, nullptr, nullptr);
3179
3180 // Legalize the types.
3181 auto VecLT = getTypeLegalizationCost(VecTy);
3182 auto DstVT = TLI->getValueType(DL, Dst);
3183 auto SrcVT = TLI->getValueType(DL, Src);
3184
3185 // If the resulting type is still a vector and the destination type is legal,
3186 // we may get the extension for free. If not, get the default cost for the
3187 // extend.
3188 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3189 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3190 CostKind);
3191
3192 // The destination type should be larger than the element type. If not, get
3193 // the default cost for the extend.
3194 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3195 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3196 CostKind);
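// Illustrative example: for
//   %e = extractelement <4 x i16> %v, i32 1
// followed by a sext to i32, the backend can emit a single
// "smov w0, v0.h[1]", so the extension may come for free.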
3197
3198 switch (Opcode) {
3199 default:
3200 llvm_unreachable("Opcode should be either SExt or ZExt");
3201
3202 // For sign-extends, we only need a smov, which performs the extension
3203 // automatically.
3204 case Instruction::SExt:
3205 return Cost;
3206
3207 // For zero-extends, the extend is performed automatically by a umov unless
3208 // the destination type is i64 and the element type is i8 or i16.
3209 case Instruction::ZExt:
3210 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3211 return Cost;
3212 }
3213
3214 // If we are unable to perform the extend for free, get the default cost.
3215 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3216 CostKind);
3217}
3218
3219 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3220 TTI::TargetCostKind CostKind,
3221 const Instruction *I) {
3222 if (CostKind != TTI::TCK_RecipThroughput)
3223 return Opcode == Instruction::PHI ? 0 : 1;
3224 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3225 // Branches are assumed to be predicted.
3226 return 0;
3227}
3228
3229InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3230 unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3231 const Instruction *I, Value *Scalar,
3232 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3233 assert(Val->isVectorTy() && "This must be a vector type");
3234
3235 if (Index != -1U) {
3236 // Legalize the type.
3237 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3238
3239 // This type is legalized to a scalar type.
3240 if (!LT.second.isVector())
3241 return 0;
3242
3243 // The type may be split. For fixed-width vectors we can normalize the
3244 // index to the new type.
3245 if (LT.second.isFixedLengthVector()) {
3246 unsigned Width = LT.second.getVectorNumElements();
3247 Index = Index % Width;
3248 }
3249
3250 // The element at index zero is already inside the vector.
3251 // - For a physical (HasRealUse==true) insert-element or extract-element
3252 // instruction that extracts integers, an explicit FPR -> GPR move is
3253 // needed. So it has non-zero cost.
3254 // - For the rest of cases (virtual instruction or element type is float),
3255 // consider the instruction free.
3256 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3257 return 0;
3258
3259 // This is recognising a LD1 single-element structure to one lane of one
3260 // register instruction. I.e., if this is an `insertelement` instruction,
3261 // and its second operand is a load, then we will generate a LD1, which
3262 // is an expensive instruction.
3263 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3264 return ST->getVectorInsertExtractBaseCost() + 1;
3265
3266 // i1 inserts and extracts will include an extra cset or cmp of the vector
3267 // value. Increase the cost by 1 to account for this.
3268 if (Val->getScalarSizeInBits() == 1)
3269 return ST->getVectorInsertExtractBaseCost() + 1;
3270
3271 // FIXME:
3272 // If the extract-element and insert-element instructions could be
3273 // simplified away (e.g., could be combined into users by looking at use-def
3274 // context), they have no cost. This is not done in the first place for
3275 // compile-time considerations.
3276 }
3277
3278 // In the case of Neon, if there exists an extractelement from lane != 0 such that
3279 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3280 // 2. extractelement result feeds into fmul.
3281 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3282 // equivalent to 0.
3283 // then the extractelement can be merged with fmul in the backend and it
3284 // incurs no cost.
3285 // e.g.
3286 // define double @foo(<2 x double> %a) {
3287 // %1 = extractelement <2 x double> %a, i32 0
3288 // %2 = extractelement <2 x double> %a, i32 1
3289 // %res = fmul double %1, %2
3290 // ret double %res
3291 // }
3292 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3293 auto ExtractCanFuseWithFmul = [&]() {
3294 // We bail out if the extract is from lane 0.
3295 if (Index == 0)
3296 return false;
3297
3298 // Check if the scalar element type of the vector operand of ExtractElement
3299 // instruction is one of the allowed types.
3300 auto IsAllowedScalarTy = [&](const Type *T) {
3301 return T->isFloatTy() || T->isDoubleTy() ||
3302 (T->isHalfTy() && ST->hasFullFP16());
3303 };
3304
3305 // Check if the extractelement user is scalar fmul.
3306 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3307 // Check if the user is scalar fmul.
3308 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3309 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3310 !BO->getType()->isVectorTy();
3311 };
3312
3313 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3314 // certain scalar type and a certain vector register width.
3315 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3316 auto RegWidth =
3317 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
3318 .getFixedValue();
3319 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3320 };
3321
3322 // Check if the type constraints on input vector type and result scalar type
3323 // of extractelement instruction are satisfied.
3324 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3325 return false;
3326
3327 if (Scalar) {
3328 DenseMap<User *, unsigned> UserToExtractIdx;
3329 for (auto *U : Scalar->users()) {
3330 if (!IsUserFMulScalarTy(U))
3331 return false;
3332 // Recording entry for the user is important. Index value is not
3333 // important.
3334 UserToExtractIdx[U];
3335 }
3336 if (UserToExtractIdx.empty())
3337 return false;
3338 for (auto &[S, U, L] : ScalarUserAndIdx) {
3339 for (auto *U : S->users()) {
3340 if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3341 auto *FMul = cast<BinaryOperator>(U);
3342 auto *Op0 = FMul->getOperand(0);
3343 auto *Op1 = FMul->getOperand(1);
3344 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3345 UserToExtractIdx[U] = L;
3346 break;
3347 }
3348 }
3349 }
3350 }
3351 for (auto &[U, L] : UserToExtractIdx) {
3352 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3353 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3354 return false;
3355 }
3356 } else {
3357 const auto *EE = cast<ExtractElementInst>(I);
3358
3359 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3360 if (!IdxOp)
3361 return false;
3362
3363 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3364 if (!IsUserFMulScalarTy(U))
3365 return false;
3366
3367 // Check if the other operand of extractelement is also extractelement
3368 // from lane equivalent to 0.
3369 const auto *BO = cast<BinaryOperator>(U);
3370 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3371 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3372 if (OtherEE) {
3373 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3374 if (!IdxOp)
3375 return false;
3376 return IsExtractLaneEquivalentToZero(
3377 cast<ConstantInt>(OtherEE->getIndexOperand())
3378 ->getValue()
3379 .getZExtValue(),
3380 OtherEE->getType()->getScalarSizeInBits());
3381 }
3382 return true;
3383 });
3384 }
3385 return true;
3386 };
3387
3388 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3389 ExtractCanFuseWithFmul())
3390 return 0;
3391
3392 // All other insert/extracts cost this much.
3393 return ST->getVectorInsertExtractBaseCost();
3394}
3395
3396 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3397 TTI::TargetCostKind CostKind,
3398 unsigned Index, Value *Op0,
3399 Value *Op1) {
3400 bool HasRealUse =
3401 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3402 return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3403}
3404
3405 InstructionCost AArch64TTIImpl::getVectorInstrCost(
3406 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3407 Value *Scalar,
3408 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3409 return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3410 ScalarUserAndIdx);
3411}
3412
3413 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3414 Type *Val,
3415 TTI::TargetCostKind CostKind,
3416 unsigned Index) {
3417 return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3418 true /* HasRealUse */, &I);
3419}
3420
3421 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3422 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3424 if (isa<ScalableVectorType>(Ty))
3425 return InstructionCost::getInvalid();
3426 if (Ty->getElementType()->isFloatingPointTy())
3427 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3428 CostKind);
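// Illustrative example: for a v4i32 with all elements demanded and both
// Insert and Extract set, the result below is 4 * (1 + 1) * the subtarget's
// base insert/extract cost.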
3429 return DemandedElts.popcount() * (Insert + Extract) *
3430 ST->getVectorInsertExtractBaseCost();
3431}
3432
3433 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3434 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3435 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3436 ArrayRef<const Value *> Args,
3437 const Instruction *CxtI) {
3438
3439 // The code-generator is currently not able to handle scalable vectors
3440 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3441 // it. This change will be removed when code-generation for these types is
3442 // sufficiently reliable.
3443 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3444 if (VTy->getElementCount() == ElementCount::getScalable(1))
3445 return InstructionCost::getInvalid();
3446
3447 // TODO: Handle more cost kinds.
3448 if (CostKind != TTI::TCK_RecipThroughput)
3449 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3450 Op2Info, Args, CxtI);
3451
3452 // Legalize the type.
3453 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3454 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3455
3456 switch (ISD) {
3457 default:
3458 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3459 Op2Info);
3460 case ISD::SDIV:
3461 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3462 // On AArch64, scalar signed division by a constant power of two is
3463 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3464 // The OperandValue properties may not be the same as those of the
3465 // previous operation; conservatively assume OP_None.
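// Illustrative example: a scalar "sdiv by 8" is typically emitted as a
// sequence along the lines of
//   add  w8, w0, #7
//   cmp  w0, #0
//   csel w8, w8, w0, lt
//   asr  w0, w8, #3
// hence the four scalar-operation costs accumulated below.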
3466 InstructionCost Cost = getArithmeticInstrCost(
3467 Instruction::Add, Ty, CostKind,
3468 Op1Info.getNoProps(), Op2Info.getNoProps());
3469 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3470 Op1Info.getNoProps(), Op2Info.getNoProps());
3471 Cost += getArithmeticInstrCost(
3472 Instruction::Select, Ty, CostKind,
3473 Op1Info.getNoProps(), Op2Info.getNoProps());
3474 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3475 Op1Info.getNoProps(), Op2Info.getNoProps());
3476 return Cost;
3477 }
3478 [[fallthrough]];
3479 case ISD::UDIV: {
3480 auto VT = TLI->getValueType(DL, Ty);
3481 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3482 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3483 // Vector signed division by a constant is expanded to the
3484 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3485 // to MULHS + SUB + SRL + ADD + SRL.
3486 InstructionCost MulCost = getArithmeticInstrCost(
3487 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3488 InstructionCost AddCost = getArithmeticInstrCost(
3489 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3490 InstructionCost ShrCost = getArithmeticInstrCost(
3491 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3492 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3493 }
3494 }
3495
3496 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
3497 // emitted by the backend even when those functions are not declared in the
3498 // module.
3499 if (!VT.isVector() && VT.getSizeInBits() > 64)
3500 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3501
3502 InstructionCost Cost = BaseT::getArithmeticInstrCost(
3503 Opcode, Ty, CostKind, Op1Info, Op2Info);
3504 if (Ty->isVectorTy()) {
3505 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3506 // If SDIV/UDIV operations are lowered using SVE, then the cost is
3507 // lower.
3508 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3509 ->getPrimitiveSizeInBits()
3510 .getFixedValue() < 128) {
3511 EVT VT = TLI->getValueType(DL, Ty);
3512 static const CostTblEntry DivTbl[]{
3513 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3514 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3515 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3516 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3517 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3518 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3519
3520 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3521 if (nullptr != Entry)
3522 return Entry->Cost;
3523 }
3524 // For 8/16-bit elements, the cost is higher because the type
3525 // requires promotion and possibly splitting:
3526 if (LT.second.getScalarType() == MVT::i8)
3527 Cost *= 8;
3528 else if (LT.second.getScalarType() == MVT::i16)
3529 Cost *= 4;
3530 return Cost;
3531 } else {
3532 // If one of the operands is a uniform constant then the cost for each
3533 // element is the cost of insertion, extraction and the division itself.
3534 // Insertion cost = 2, extraction cost = 2, division = the cost of the
3535 // operation on the scalar type.
3536 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3537 (Op2Info.isConstant() && Op2Info.isUniform())) {
3538 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3539 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3540 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3541 return (4 + DivCost) * VTy->getNumElements();
3542 }
3543 }
3544 // On AArch64, without SVE, vector divisions are expanded
3545 // into scalar divisions of each pair of elements.
3546 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3547 CostKind, Op1Info, Op2Info);
3548 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3549 Op1Info, Op2Info);
3550 }
3551
3552 // TODO: if one of the arguments is scalar, then it's not necessary to
3553 // double the cost of handling the vector elements.
3554 Cost += Cost;
3555 }
3556 return Cost;
3557 }
3558 case ISD::MUL:
3559 // When SVE is available, we can lower the v2i64 operation using
3560 // the SVE mul instruction, which has a lower cost.
3561 if (LT.second == MVT::v2i64 && ST->hasSVE())
3562 return LT.first;
3563
3564 // When SVE is not available, there is no MUL.2d instruction,
3565 // which means mul <2 x i64> is expensive as elements are extracted
3566 // from the vectors and the muls scalarized.
3567 // As getScalarizationOverhead is a bit too pessimistic, we
3568 // estimate the cost for a i64 vector directly here, which is:
3569 // - four 2-cost i64 extracts,
3570 // - two 2-cost i64 inserts, and
3571 // - two 1-cost muls.
3572 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3573 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3574 // need to scalarize, so the cost can be cheaper (smull or umull).
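// Worked out: 4 extracts * 2 + 2 inserts * 2 + 2 muls * 1 = 14.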
3576 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3577 return LT.first;
3578 return LT.first * 14;
3579 case ISD::ADD:
3580 case ISD::XOR:
3581 case ISD::OR:
3582 case ISD::AND:
3583 case ISD::SRL:
3584 case ISD::SRA:
3585 case ISD::SHL:
3586 // These nodes are marked as 'custom' for combining purposes only.
3587 // We know that they are legal. See LowerAdd in ISelLowering.
3588 return LT.first;
3589
3590 case ISD::FNEG:
3591 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3592 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3593 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3594 CxtI &&
3595 ((CxtI->hasOneUse() &&
3596 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3597 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3598 return 0;
3599 [[fallthrough]];
3600 case ISD::FADD:
3601 case ISD::FSUB:
3602 // Increase the cost for half and bfloat types if not architecturally
3603 // supported.
3604 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3605 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3606 return 2 * LT.first;
3607 if (!Ty->getScalarType()->isFP128Ty())
3608 return LT.first;
3609 [[fallthrough]];
3610 case ISD::FMUL:
3611 case ISD::FDIV:
3612 // These nodes are marked as 'custom' just to lower them to SVE.
3613 // We know said lowering will incur no additional cost.
3614 if (!Ty->getScalarType()->isFP128Ty())
3615 return 2 * LT.first;
3616
3617 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3618 Op2Info);
3619 case ISD::FREM:
3620 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3621 // those functions are not declared in the module.
3622 if (!Ty->isVectorTy())
3623 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3624 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3625 Op2Info);
3626 }
3627}
3628
3629 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3630 ScalarEvolution *SE,
3631 const SCEV *Ptr) {
3632 // Address computations in vectorized code with non-consecutive addresses will
3633 // likely result in more instructions compared to scalar code where the
3634 // computation can more often be merged into the index mode. The resulting
3635 // extra micro-ops can significantly decrease throughput.
3636 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3637 int MaxMergeDistance = 64;
3638
3639 if (Ty->isVectorTy() && SE &&
3640 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3641 return NumVectorInstToHideOverhead;
3642
3643 // In many cases the address computation is not merged into the instruction
3644 // addressing mode.
3645 return 1;
3646}
3647
3648 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
3649 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3650 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
3651 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3652 // TODO: Handle other cost kinds.
3653 if (CostKind != TTI::TCK_RecipThroughput)
3654 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3655 Op1Info, Op2Info, I);
3656
3657 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3658 // We don't lower some vector selects well when they are wider than the
3659 // register width.
3660 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3661 // We would need this many instructions to hide the scalarization happening.
3662 const int AmortizationCost = 20;
3663
3664 // If VecPred is not set, check if we can get a predicate from the context
3665 // instruction, if its type matches the requested ValTy.
3666 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3667 CmpPredicate CurrentPred;
3668 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3669 m_Value())))
3670 VecPred = CurrentPred;
3671 }
3672 // Check if we have a compare/select chain that can be lowered using
3673 // a (F)CMxx & BFI pair.
3674 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3675 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3676 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3677 VecPred == CmpInst::FCMP_UNE) {
3678 static const auto ValidMinMaxTys = {
3679 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3680 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3681 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3682
3683 auto LT = getTypeLegalizationCost(ValTy);
3684 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3685 (ST->hasFullFP16() &&
3686 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3687 return LT.first;
3688 }
3689
3690 static const TypeConversionCostTblEntry
3691 VectorSelectTbl[] = {
3692 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3693 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3694 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3695 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3696 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3697 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3698 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3699 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3700 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3701 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3702 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3703 };
3704
3705 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3706 EVT SelValTy = TLI->getValueType(DL, ValTy);
3707 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3708 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3709 SelCondTy.getSimpleVT(),
3710 SelValTy.getSimpleVT()))
3711 return Entry->Cost;
3712 }
3713 }
3714
3715 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3716 auto LT = getTypeLegalizationCost(ValTy);
3717 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3718 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3719 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3720 }
3721
3722 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3723 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3724 // be profitable.
3725 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3726 ICmpInst::isEquality(VecPred) &&
3727 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3728 match(I->getOperand(1), m_Zero()) &&
3729 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3730 return 0;
3731
3732 // The base case handles scalable vectors fine for now, since it treats the
3733 // cost as 1 * legalization cost.
3734 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3735 Op1Info, Op2Info, I);
3736}
3737
3738 TTI::MemCmpExpansionOptions
3739 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3740 TTI::MemCmpExpansionOptions Options;
3741 if (ST->requiresStrictAlign()) {
3742 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3743 // a bunch of instructions when strict align is enabled.
3744 return Options;
3745 }
3746 Options.AllowOverlappingLoads = true;
3747 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3748 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3749 // TODO: Though vector loads usually perform well on AArch64, on some targets
3750 // they may wake up the FP unit, which raises the power consumption. Perhaps
3751 // they could be used with no holds barred (-O3).
3752 Options.LoadSizes = {8, 4, 2, 1};
3753 Options.AllowedTailExpansions = {3, 5, 6};
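// Illustrative reading: with LoadSizes {8, 4, 2, 1}, a 16-byte memcmp can be
// expanded into two 8-byte load/compare pairs per buffer, and the tail sizes
// listed above (3, 5 and 6 bytes) may be handled with a pair of narrower
// loads instead of falling back to a byte loop.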
3754 return Options;
3755}
3756
3757 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3758 return ST->hasSVE();
3759}
3760
3761 InstructionCost
3762 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3763 Align Alignment, unsigned AddressSpace,
3764 TTI::TargetCostKind CostKind) {
3765 if (useNeonVector(Src))
3766 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3767 CostKind);
3768 auto LT = getTypeLegalizationCost(Src);
3769 if (!LT.first.isValid())
3770 return InstructionCost::getInvalid();
3771
3772 // Return an invalid cost for element types that we are unable to lower.
3773 auto *VT = cast<VectorType>(Src);
3774 if (VT->getElementType()->isIntegerTy(1))
3775 return InstructionCost::getInvalid();
3776
3777 // The code-generator is currently not able to handle scalable vectors
3778 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3779 // it. This change will be removed when code-generation for these types is
3780 // sufficiently reliable.
3781 if (VT->getElementCount() == ElementCount::getScalable(1))
3782 return InstructionCost::getInvalid();
3783
3784 return LT.first;
3785}
3786
3787 // This function returns the gather/scatter overhead, either from the
3788 // user-provided value or from specialized per-target values in \p ST.
3789static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3790 const AArch64Subtarget *ST) {
3791 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3792 "Should be called on only load or stores.");
3793 switch (Opcode) {
3794 case Instruction::Load:
3795 if (SVEGatherOverhead.getNumOccurrences() > 0)
3796 return SVEGatherOverhead;
3797 return ST->getGatherOverhead();
3798 break;
3799 case Instruction::Store:
3800 if (SVEScatterOverhead.getNumOccurrences() > 0)
3801 return SVEScatterOverhead;
3802 return ST->getScatterOverhead();
3803 break;
3804 default:
3805 llvm_unreachable("Shouldn't have reached here");
3806 }
3807}
3808
3809 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3810 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3811 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3812 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3813 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3814 Alignment, CostKind, I);
3815 auto *VT = cast<VectorType>(DataTy);
3816 auto LT = getTypeLegalizationCost(DataTy);
3817 if (!LT.first.isValid())
3818 return InstructionCost::getInvalid();
3819
3820 // Return an invalid cost for element types that we are unable to lower.
3821 if (!LT.second.isVector() ||
3822 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3823 VT->getElementType()->isIntegerTy(1))
3824 return InstructionCost::getInvalid();
3825
3826 // The code-generator is currently not able to handle scalable vectors
3827 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3828 // it. This change will be removed when code-generation for these types is
3829 // sufficiently reliable.
3830 if (VT->getElementCount() == ElementCount::getScalable(1))
3831 return InstructionCost::getInvalid();
3832
3833 ElementCount LegalVF = LT.second.getVectorElementCount();
3834 InstructionCost MemOpCost =
3835 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3836 {TTI::OK_AnyValue, TTI::OP_None}, I);
3837 // Add on an overhead cost for using gathers/scatters.
3838 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3839 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3840}
3841
3843 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3844}
3845
3846 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3847 MaybeAlign Alignment,
3848 unsigned AddressSpace,
3849 TTI::TargetCostKind CostKind,
3850 TTI::OperandValueInfo OpInfo,
3851 const Instruction *I) {
3852 EVT VT = TLI->getValueType(DL, Ty, true);
3853 // Type legalization can't handle structs
3854 if (VT == MVT::Other)
3855 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3856 CostKind);
3857
3858 auto LT = getTypeLegalizationCost(Ty);
3859 if (!LT.first.isValid())
3860 return InstructionCost::getInvalid();
3861
3862 // The code-generator is currently not able to handle scalable vectors
3863 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3864 // it. This change will be removed when code-generation for these types is
3865 // sufficiently reliable.
3866 // We also only support full register predicate loads and stores.
3867 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3868 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3869 (VTy->getElementType()->isIntegerTy(1) &&
3870 !VTy->getElementCount().isKnownMultipleOf(
3871 ElementCount::getScalable(16))))
3872 return InstructionCost::getInvalid();
3873
3874 // TODO: consider latency as well for TCK_SizeAndLatency.
3875 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3876 return LT.first;
3877
3878 if (CostKind != TTI::TCK_RecipThroughput)
3879 return 1;
3880
3881 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3882 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3883 // Unaligned stores are extremely inefficient. We don't split all
3884 // unaligned 128-bit stores because of the negative impact that has been
3885 // shown in practice on inlined block copy code.
3886 // We make such stores expensive so that we will only vectorize if there
3887 // are 6 other instructions getting vectorized.
3888 const int AmortizationCost = 6;
3889
3890 return LT.first * 2 * AmortizationCost;
3891 }
3892
3893 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3894 if (Ty->isPtrOrPtrVectorTy())
3895 return LT.first;
3896
3897 if (useNeonVector(Ty)) {
3898 // Check truncating stores and extending loads.
3899 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3900 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3901 if (VT == MVT::v4i8)
3902 return 2;
3903 // Otherwise we need to scalarize.
3904 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3905 }
3906 EVT EltVT = VT.getVectorElementType();
3907 unsigned EltSize = EltVT.getScalarSizeInBits();
3908 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3909 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3910 *Alignment != Align(1))
3911 return LT.first;
3912 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3913 // widening to v4i8, which produces suboptimal results.
3914 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3915 return LT.first;
3916
3917 // Check non-power-of-2 loads/stores for legal vector element types with
3918 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3919 // operations on smaller power-of-2 ops, including ld1/st1.
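// Illustrative example: a v3i32 access is split into v2i32 + v1i32 by the
// worklist below, giving a cost of 2.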
3920 LLVMContext &C = Ty->getContext();
3921 InstructionCost Cost(0);
3922 SmallVector<EVT> TypeWorklist;
3923 TypeWorklist.push_back(VT);
3924 while (!TypeWorklist.empty()) {
3925 EVT CurrVT = TypeWorklist.pop_back_val();
3926 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3927 if (isPowerOf2_32(CurrNumElements)) {
3928 Cost += 1;
3929 continue;
3930 }
3931
3932 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3933 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3934 TypeWorklist.push_back(
3935 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3936 }
3937 return Cost;
3938 }
3939
3940 return LT.first;
3941}
3942
3943 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3944 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3945 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3946 bool UseMaskForCond, bool UseMaskForGaps) {
3947 assert(Factor >= 2 && "Invalid interleave factor");
3948 auto *VecVTy = cast<VectorType>(VecTy);
3949
3950 if (VecTy->isScalableTy() && !ST->hasSVE())
3951 return InstructionCost::getInvalid();
3952
3953 // Vectorization for masked interleaved accesses is only enabled for scalable
3954 // VF.
3955 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3956 return InstructionCost::getInvalid();
3957
3958 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3959 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3960 auto *SubVecTy =
3961 VectorType::get(VecVTy->getElementType(),
3962 VecVTy->getElementCount().divideCoefficientBy(Factor));
3963
3964 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3965 // Accesses having vector types that are a multiple of 128 bits can be
3966 // matched to more than one ldN/stN instruction.
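// Illustrative example: an interleaved access of <8 x i32> with Factor == 2
// uses a v4i32 sub-vector (128 bits), so the returned cost is 2 * 1 for a
// single ld2/st2.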
3967 bool UseScalable;
3968 if (MinElts % Factor == 0 &&
3969 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3970 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3971 }
3972
3973 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3974 Alignment, AddressSpace, CostKind,
3975 UseMaskForCond, UseMaskForGaps);
3976}
3977
3978 InstructionCost
3979 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3980 InstructionCost Cost = 0;
3981 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3982 for (auto *I : Tys) {
3983 if (!I->isVectorTy())
3984 continue;
3985 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3986 128)
3987 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3988 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3989 }
3990 return Cost;
3991}
3992
3993 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3994 return ST->getMaxInterleaveFactor();
3995}
3996
3997// For Falkor, we want to avoid having too many strided loads in a loop since
3998// that can exhaust the HW prefetcher resources. We adjust the unroller
3999// MaxCount preference below to attempt to ensure unrolling doesn't create too
4000// many strided loads.
4001static void
4002 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4003 TargetTransformInfo::UnrollingPreferences &UP) {
4004 enum { MaxStridedLoads = 7 };
4005 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4006 int StridedLoads = 0;
4007 // FIXME? We could make this more precise by looking at the CFG and
4008 // e.g. not counting loads in each side of an if-then-else diamond.
4009 for (const auto BB : L->blocks()) {
4010 for (auto &I : *BB) {
4011 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4012 if (!LMemI)
4013 continue;
4014
4015 Value *PtrValue = LMemI->getPointerOperand();
4016 if (L->isLoopInvariant(PtrValue))
4017 continue;
4018
4019 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4020 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4021 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4022 continue;
4023
4024 // FIXME? We could take pairing of unrolled load copies into account
4025 // by looking at the AddRec, but we would probably have to limit this
4026 // to loops with no stores or other memory optimization barriers.
4027 ++StridedLoads;
4028 // We've seen enough strided loads that seeing more won't make a
4029 // difference.
4030 if (StridedLoads > MaxStridedLoads / 2)
4031 return StridedLoads;
4032 }
4033 }
4034 return StridedLoads;
4035 };
4036
4037 int StridedLoads = countStridedLoads(L, SE);
4038 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4039 << " strided loads\n");
4040 // Pick the largest power of 2 unroll count that won't result in too many
4041 // strided loads.
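// Illustrative example: with 2 strided loads detected, MaxCount becomes
// 1 << Log2_32(7 / 2) == 2.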
4042 if (StridedLoads) {
4043 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4044 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4045 << UP.MaxCount << '\n');
4046 }
4047}
4048
4049 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4050 /// OOO engine's wide instruction window and various predictors.
4051static void
4052 getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4053 TargetTransformInfo::UnrollingPreferences &UP,
4054 AArch64TTIImpl &TTI) {
4055 // Limit loops with structure that is highly likely to benefit from runtime
4056 // unrolling; that is, we exclude outer loops, loops with multiple exits and
4057 // many blocks (i.e. likely with complex control flow). Note that the
4058 // heuristics here may be overly conservative and we err on the side of
4059 // avoiding runtime unrolling rather than unroll excessively. They are all
4060 // subject to further refinement.
4061 if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4062 return;
4063
4064 const SCEV *BTC = SE.getBackedgeTakenCount(L);
4065 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4066 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4067 SE.getSmallConstantMaxTripCount(L) <= 32))
4068 return;
4069 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4070 return;
4071
4072 int64_t Size = 0;
4073 for (auto *BB : L->getBlocks()) {
4074 for (auto &I : *BB) {
4075 if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4076 return;
4077 SmallVector<const Value *, 4> Operands(I.operand_values());
4078 Size +=
4079 *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4080 }
4081 }
4082
4083 // Limit to loops with trip counts that are cheap to expand.
4084 UP.SCEVExpansionBudget = 1;
4085
4086 // Try to unroll small, single block loops, if they have load/store
4087 // dependencies, to expose more parallel memory access streams.
4088 if (L->getHeader() != L->getLoopLatch() || Size > 8)
4089 return;
4090
4091 SmallPtrSet<Value *, 8> LoadedValues;
4092 SmallVector<StoreInst *> Stores;
4093 for (auto *BB : L->blocks()) {
4094 for (auto &I : *BB) {
4095 Value *Ptr = getLoadStorePointerOperand(&I);
4096 if (!Ptr)
4097 continue;
4098 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4099 if (SE.isLoopInvariant(PtrSCEV, L))
4100 continue;
4101 if (isa<LoadInst>(&I))
4102 LoadedValues.insert(&I);
4103 else
4104 Stores.push_back(cast<StoreInst>(&I));
4105 }
4106 }
4107
4108 // Try to find an unroll count that maximizes the use of the instruction
4109 // window, i.e. trying to fetch as many instructions per cycle as possible.
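// Illustrative example: with Size == 12 the loop below picks BestUC == 4,
// since 4 * 12 == 48 is a multiple of MaxInstsPerLine and still within the
// 48-instruction budget.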
4110 unsigned MaxInstsPerLine = 16;
4111 unsigned UC = 1;
4112 unsigned BestUC = 1;
4113 unsigned SizeWithBestUC = BestUC * Size;
4114 while (UC <= 8) {
4115 unsigned SizeWithUC = UC * Size;
4116 if (SizeWithUC > 48)
4117 break;
4118 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4119 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4120 BestUC = UC;
4121 SizeWithBestUC = BestUC * Size;
4122 }
4123 UC++;
4124 }
4125
4126 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4127 return LoadedValues.contains(SI->getOperand(0));
4128 }))
4129 return;
4130
4131 UP.Runtime = true;
4132 UP.DefaultUnrollRuntimeCount = BestUC;
4133}
4134
4135 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4136 TTI::UnrollingPreferences &UP,
4137 OptimizationRemarkEmitter *ORE) {
4138 // Enable partial unrolling and runtime unrolling.
4139 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4140
4141 UP.UpperBound = true;
4142
4143 // An inner loop is more likely to be hot, and the runtime check can be
4144 // hoisted out by the LICM pass, so the overhead is lower; try a larger
4145 // threshold to unroll more loops.
4146 if (L->getLoopDepth() > 1)
4147 UP.PartialThreshold *= 2;
4148
4149 // Disable partial & runtime unrolling on -Os.
4150 UP.PartialOptSizeThreshold = 0;
4151
4152 // Apply subtarget-specific unrolling preferences.
4153 switch (ST->getProcFamily()) {
4154 case AArch64Subtarget::AppleA14:
4155 case AArch64Subtarget::AppleA15:
4156 case AArch64Subtarget::AppleA16:
4157 case AArch64Subtarget::AppleM4:
4158 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4159 break;
4160 case AArch64Subtarget::Falkor:
4163 break;
4164 default:
4165 break;
4166 }
4167
4168 // Scan the loop: don't unroll loops with calls as this could prevent
4169 // inlining. Don't unroll vector loops either, as they don't benefit much from
4170 // unrolling.
4171 for (auto *BB : L->getBlocks()) {
4172 for (auto &I : *BB) {
4173 // Don't unroll vectorised loop.
4174 if (I.getType()->isVectorTy())
4175 return;
4176
4177 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4178 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4179 if (!isLoweredToCall(F))
4180 continue;
4181 }
4182 return;
4183 }
4184 }
4185 }
4186
4187 // Enable runtime unrolling for in-order models.
4188 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
4189 // checking for that case, we can ensure that the default behaviour is
4190 // unchanged.
4191 if (ST->getProcFamily() != AArch64Subtarget::Others &&
4192 !ST->getSchedModel().isOutOfOrder()) {
4193 UP.Runtime = true;
4194 UP.Partial = true;
4195 UP.UnrollRemainder = true;
4196 UP.DefaultUnrollRuntimeCount = 4;
4197
4198 UP.UnrollAndJam = true;
4199 UP.UnrollAndJamInnerLoopThreshold = 60;
4200 }
4201}
4202
4203 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
4204 TTI::PeelingPreferences &PP) {
4205 BaseT::getPeelingPreferences(L, SE, PP);
4206 }
4207
4208 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4209 Type *ExpectedType) {
4210 switch (Inst->getIntrinsicID()) {
4211 default:
4212 return nullptr;
4213 case Intrinsic::aarch64_neon_st2:
4214 case Intrinsic::aarch64_neon_st3:
4215 case Intrinsic::aarch64_neon_st4: {
4216 // Create a struct type
4217 StructType *ST = dyn_cast<StructType>(ExpectedType);
4218 if (!ST)
4219 return nullptr;
4220 unsigned NumElts = Inst->arg_size() - 1;
4221 if (ST->getNumElements() != NumElts)
4222 return nullptr;
4223 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4224 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4225 return nullptr;
4226 }
4227 Value *Res = PoisonValue::get(ExpectedType);
4228 IRBuilder<> Builder(Inst);
4229 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4230 Value *L = Inst->getArgOperand(i);
4231 Res = Builder.CreateInsertValue(Res, L, i);
4232 }
4233 return Res;
4234 }
4235 case Intrinsic::aarch64_neon_ld2:
4236 case Intrinsic::aarch64_neon_ld3:
4237 case Intrinsic::aarch64_neon_ld4:
4238 if (Inst->getType() == ExpectedType)
4239 return Inst;
4240 return nullptr;
4241 }
4242}
4243
4244 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
4245 MemIntrinsicInfo &Info) {
4246 switch (Inst->getIntrinsicID()) {
4247 default:
4248 break;
4249 case Intrinsic::aarch64_neon_ld2:
4250 case Intrinsic::aarch64_neon_ld3:
4251 case Intrinsic::aarch64_neon_ld4:
4252 Info.ReadMem = true;
4253 Info.WriteMem = false;
4254 Info.PtrVal = Inst->getArgOperand(0);
4255 break;
4256 case Intrinsic::aarch64_neon_st2:
4257 case Intrinsic::aarch64_neon_st3:
4258 case Intrinsic::aarch64_neon_st4:
4259 Info.ReadMem = false;
4260 Info.WriteMem = true;
4261 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
4262 break;
4263 }
4264
4265 switch (Inst->getIntrinsicID()) {
4266 default:
4267 return false;
4268 case Intrinsic::aarch64_neon_ld2:
4269 case Intrinsic::aarch64_neon_st2:
4270 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4271 break;
4272 case Intrinsic::aarch64_neon_ld3:
4273 case Intrinsic::aarch64_neon_st3:
4274 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4275 break;
4276 case Intrinsic::aarch64_neon_ld4:
4277 case Intrinsic::aarch64_neon_st4:
4278 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4279 break;
4280 }
4281 return true;
4282}
4283
4284 /// See if \p I should be considered for address type promotion. We check if
4285 /// \p I is a sext with the right type and used in memory accesses. If it is
4286 /// used in a "complex" getelementptr, we allow it to be promoted without
4287 /// finding other sext instructions that sign extended the same initial value.
4288 /// A getelementptr is considered "complex" if it has more than 2 operands.
4289 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
4290 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4291 bool Considerable = false;
4292 AllowPromotionWithoutCommonHeader = false;
4293 if (!isa<SExtInst>(&I))
4294 return false;
4295 Type *ConsideredSExtType =
4296 Type::getInt64Ty(I.getParent()->getParent()->getContext());
4297 if (I.getType() != ConsideredSExtType)
4298 return false;
4299 // See if the sext is the one with the right type and used in at least one
4300 // GetElementPtrInst.
4301 for (const User *U : I.users()) {
4302 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4303 Considerable = true;
4304 // A getelementptr is considered as "complex" if it has more than 2
4305 // operands. We will promote a SExt used in such complex GEP as we
4306 // expect some computation to be merged if they are done on 64 bits.
4307 if (GEPInst->getNumOperands() > 2) {
4308 AllowPromotionWithoutCommonHeader = true;
4309 break;
4310 }
4311 }
4312 }
4313 return Considerable;
4314}
4315
4316 bool AArch64TTIImpl::isLegalToVectorizeReduction(
4317 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4318 if (!VF.isScalable())
4319 return true;
4320
4321 Type *Ty = RdxDesc.getRecurrenceType();
4322 if (Ty->isBFloatTy() && !ST->hasBF16())
4323 return false;
4324
4325 switch (RdxDesc.getRecurrenceKind()) {
4326 case RecurKind::Add:
4327 case RecurKind::FAdd:
4328 case RecurKind::And:
4329 case RecurKind::Or:
4330 case RecurKind::Xor:
4331 case RecurKind::SMin:
4332 case RecurKind::SMax:
4333 case RecurKind::UMin:
4334 case RecurKind::UMax:
4335 case RecurKind::FMin:
4336 case RecurKind::FMax:
4337 case RecurKind::FMulAdd:
4338 case RecurKind::IAnyOf:
4339 case RecurKind::FAnyOf:
4340 return true;
4341 default:
4342 return false;
4343 }
4344}
4345
4346 InstructionCost
4347 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
4348 FastMathFlags FMF,
4349 TTI::TargetCostKind CostKind) {
4350 // The code-generator is currently not able to handle scalable vectors
4351 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4352 // it. This change will be removed when code-generation for these types is
4353 // sufficiently reliable.
4354 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4355 if (VTy->getElementCount() == ElementCount::getScalable(1))
4356 return InstructionCost::getInvalid();
4357
4358 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4359
4360 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
4361 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
4362
4363 InstructionCost LegalizationCost = 0;
4364 if (LT.first > 1) {
4365 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
4366 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
4367 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
4368 }
4369
4370 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
4371}
4372
4373 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
4374 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
4375 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4376 InstructionCost LegalizationCost = 0;
4377 if (LT.first > 1) {
4378 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
4379 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
4380 LegalizationCost *= LT.first - 1;
4381 }
4382
4383 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4384 assert(ISD && "Invalid opcode");
4385 // Add the final reduction cost for the legal horizontal reduction
4386 switch (ISD) {
4387 case ISD::ADD:
4388 case ISD::AND:
4389 case ISD::OR:
4390 case ISD::XOR:
4391 case ISD::FADD:
4392 return LegalizationCost + 2;
4393 default:
4394 return InstructionCost::getInvalid();
4395 }
4396}
4397
4398 InstructionCost
4399 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
4400 std::optional<FastMathFlags> FMF,
4401 TTI::TargetCostKind CostKind) {
4402 // The code-generator is currently not able to handle scalable vectors
4403 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4404 // it. This change will be removed when code-generation for these types is
4405 // sufficiently reliable.
4406 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
4407 if (VTy->getElementCount() == ElementCount::getScalable(1))
4408 return InstructionCost::getInvalid();
4409
4410 if (CostKind != TTI::TCK_RecipThroughput) {
4411 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
4412 InstructionCost BaseCost =
4413 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4414 // Add on extra cost to reflect the extra overhead on some CPUs. We still
4415 // end up vectorizing for more computationally intensive loops.
4416 return BaseCost + FixedVTy->getNumElements();
4417 }
4418
4419 if (Opcode != Instruction::FAdd)
4420 return InstructionCost::getInvalid();
4421
4422 auto *VTy = cast<ScalableVectorType>(ValTy);
4423 InstructionCost Cost =
4424 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
4425 Cost *= getMaxNumElements(VTy->getElementCount());
4426 return Cost;
4427 }
4428
4429 if (isa<ScalableVectorType>(ValTy))
4430 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
4431
4432 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4433 MVT MTy = LT.second;
4434 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4435 assert(ISD && "Invalid opcode");
4436
4437 // Horizontal adds can use the 'addv' instruction. We model the cost of these
4438 // instructions as twice a normal vector add, plus 1 for each legalization
4439 // step (LT.first). This is the only arithmetic vector reduction operation for
4440 // which we have an instruction.
4441 // OR, XOR and AND costs should match the codegen from:
4442 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
4443 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4444 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4445 static const CostTblEntry CostTblNoPairwise[]{
4446 {ISD::ADD, MVT::v8i8, 2},
4447 {ISD::ADD, MVT::v16i8, 2},
4448 {ISD::ADD, MVT::v4i16, 2},
4449 {ISD::ADD, MVT::v8i16, 2},
4450 {ISD::ADD, MVT::v4i32, 2},
4451 {ISD::ADD, MVT::v2i64, 2},
4452 {ISD::OR, MVT::v8i8, 15},
4453 {ISD::OR, MVT::v16i8, 17},
4454 {ISD::OR, MVT::v4i16, 7},
4455 {ISD::OR, MVT::v8i16, 9},
4456 {ISD::OR, MVT::v2i32, 3},
4457 {ISD::OR, MVT::v4i32, 5},
4458 {ISD::OR, MVT::v2i64, 3},
4459 {ISD::XOR, MVT::v8i8, 15},
4460 {ISD::XOR, MVT::v16i8, 17},
4461 {ISD::XOR, MVT::v4i16, 7},
4462 {ISD::XOR, MVT::v8i16, 9},
4463 {ISD::XOR, MVT::v2i32, 3},
4464 {ISD::XOR, MVT::v4i32, 5},
4465 {ISD::XOR, MVT::v2i64, 3},
4466 {ISD::AND, MVT::v8i8, 15},
4467 {ISD::AND, MVT::v16i8, 17},
4468 {ISD::AND, MVT::v4i16, 7},
4469 {ISD::AND, MVT::v8i16, 9},
4470 {ISD::AND, MVT::v2i32, 3},
4471 {ISD::AND, MVT::v4i32, 5},
4472 {ISD::AND, MVT::v2i64, 3},
4473 };
4474 switch (ISD) {
4475 default:
4476 break;
4477 case ISD::FADD:
4478 if (Type *EltTy = ValTy->getScalarType();
4479 // FIXME: For half types without fullfp16 support, this could extend and
4480 // use a fp32 faddp reduction but current codegen unrolls.
4481 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
4482 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
4483 const unsigned NElts = MTy.getVectorNumElements();
4484 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
4485 isPowerOf2_32(NElts))
4486 // A reduction corresponding to a series of fadd instructions is lowered to
4487 // a series of faddp instructions. faddp has latency/throughput that
4488 // matches fadd instruction and hence, every faddp instruction can be
4489 // considered to have a relative cost = 1 with
4490 // CostKind = TCK_RecipThroughput.
4491 // An faddp will pairwise add vector elements, so the size of input
4492 // vector reduces by half every time, requiring
4493 // #(faddp instructions) = log2_32(NElts).
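// Illustrative example: a v4f32 fadd reduction needs log2(4) == 2 faddp
// steps, so with LT.first == 1 the returned cost is 2.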
4494 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
4495 }
4496 break;
4497 case ISD::ADD:
4498 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4499 return (LT.first - 1) + Entry->Cost;
4500 break;
4501 case ISD::XOR:
4502 case ISD::AND:
4503 case ISD::OR:
4504 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4505 if (!Entry)
4506 break;
4507 auto *ValVTy = cast<FixedVectorType>(ValTy);
4508 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4509 isPowerOf2_32(ValVTy->getNumElements())) {
4510 InstructionCost ExtraCost = 0;
4511 if (LT.first != 1) {
4512 // Type needs to be split, so there is an extra cost of LT.first - 1
4513 // arithmetic ops.
4514 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4515 MTy.getVectorNumElements());
4516 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4517 ExtraCost *= LT.first - 1;
4518 }
4519 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4520 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4521 return Cost + ExtraCost;
4522 }
4523 break;
4524 }
4525 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4526}
4527
4528 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
4529 static const CostTblEntry ShuffleTbl[] = {
4530 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4531 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4532 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4533 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4534 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4535 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4536 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4537 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4538 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4539 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4540 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4541 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4542 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4543 };
4544
4545 // The code-generator is currently not able to handle scalable vectors
4546 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4547 // it. This change will be removed when code-generation for these types is
4548 // sufficiently reliable.
4549 if (Tp->getElementCount() == ElementCount::getScalable(1))
4550 return InstructionCost::getInvalid();
4551
4552 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4553 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4554 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4555 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4556 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4557 : LT.second;
4558 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4559 InstructionCost LegalizationCost = 0;
4560 if (Index < 0) {
4561 LegalizationCost =
4562 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4563 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
4564 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4565 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4566 }
4567
4568 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
4569 // Cost performed on a promoted type.
4570 if (LT.second.getScalarType() == MVT::i1) {
4571 LegalizationCost +=
4572 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4573 TTI::CastContextHint::None, CostKind) +
4574 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4575 TTI::CastContextHint::None, CostKind);
4576 }
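 // For example: an nxv16i1 splice is promoted to nxv16i8, so on top of the
 // single SVE splice from the table it pays for the zext/trunc pair above
 // and, when Index is negative, the compare/select used to form the splice
 // mask.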
4577 const auto *Entry =
4578 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4579 assert(Entry && "Illegal Type for Splice");
4580 LegalizationCost += Entry->Cost;
4581 return LegalizationCost * LT.first;
4582}
4583
4584 InstructionCost AArch64TTIImpl::getShuffleCost(
4585 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
4586 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
4587 ArrayRef<const Value *> Args, const Instruction *CxtI) {
4588 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4589
4590 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4591 // into smaller vectors and sum the cost of each shuffle.
4592 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4593 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4594 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4595
4596 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4597 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4598 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4599 // cost than just the load.
4600 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4601 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4) ||
4602 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3)))
4603 return std::max<InstructionCost>(1, LT.first / 4);
4604
4605 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4606 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4607 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4608 // cost than just the store.
4609 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4610 (ShuffleVectorInst::isInterleaveMask(
4611 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4612 ShuffleVectorInst::isInterleaveMask(
4613 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4614 return LT.first;
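 // As an illustration: a factor-3 deinterleaving mask such as <0, 3, 6, ...>
 // after a wide load corresponds to one part of an ld3, and the matching
 // interleaving mask in front of a store corresponds to st3, so the shuffle
 // is charged a small LT.first-based cost rather than a full per-element
 // permute.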
4615
4616 unsigned TpNumElts = Mask.size();
4617 unsigned LTNumElts = LT.second.getVectorNumElements();
4618 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
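 // For instance, a 16-element mask on a type that legalizes to 4-element
 // vectors gives NumVecs = 4; each chunk below is re-costed as its own
 // (at most two-source) shuffle and the results are summed.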
4619 VectorType *NTp =
4620 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4621 InstructionCost Cost;
4622 for (unsigned N = 0; N < NumVecs; N++) {
4623 SmallVector<int> NMask;
4624 // Split the existing mask into chunks of size LTNumElts. Track the source
4625 // sub-vectors to ensure the result has at most 2 inputs.
4626 unsigned Source1, Source2;
4627 unsigned NumSources = 0;
4628 for (unsigned E = 0; E < LTNumElts; E++) {
4629 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4630 : PoisonMaskElem;
4631 if (MaskElt < 0) {
4632 NMask.push_back(PoisonMaskElem);
4633 continue;
4634 }
4635
4636 // Calculate which source from the input this comes from and whether it
4637 // is new to us.
4638 unsigned Source = MaskElt / LTNumElts;
4639 if (NumSources == 0) {
4640 Source1 = Source;
4641 NumSources = 1;
4642 } else if (NumSources == 1 && Source != Source1) {
4643 Source2 = Source;
4644 NumSources = 2;
4645 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4646 NumSources++;
4647 }
4648
4649 // Add to the new mask. For the NumSources>2 case these are not correct,
4650 // but are only used for the modular lane number.
4651 if (Source == Source1)
4652 NMask.push_back(MaskElt % LTNumElts);
4653 else if (Source == Source2)
4654 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4655 else
4656 NMask.push_back(MaskElt % LTNumElts);
4657 }
4658 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4659 // getShuffleCost. If not then cost it using the worst case as the number
4660 // of element moves into a new vector.
4661 if (NumSources <= 2)
4662 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4663 : TTI::SK_PermuteTwoSrc,
4664 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4665 else
4666 Cost += LTNumElts;
4667 }
4668 return Cost;
4669 }
4670
4671 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4672 // Treat extractsubvector as single op permutation.
4673 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4674 if (IsExtractSubvector && LT.second.isFixedLengthVector())
4675 Kind = TTI::SK_PermuteSingleSrc;
4676
4677 // Check for broadcast loads, which are supported by the LD1R instruction.
4678 // In terms of code-size, the shuffle vector is free when a load + dup get
4679 // folded into a LD1R. That's what we check and return here. For performance
4680 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4681 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4682 // that we model the load + dup sequence slightly higher because LD1R is a
4683 // high latency instruction.
4684 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4685 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4686 if (IsLoad && LT.second.isVector() &&
4687 isLegalBroadcastLoad(Tp->getElementType(),
4688 LT.second.getVectorElementCount()))
4689 return 0;
4690 }
4691
4692 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4693 // from the perfect shuffle tables.
4694 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4695 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4696 all_of(Mask, [](int E) { return E < 8; }))
4697 return getPerfectShuffleCost(Mask);
4698
4699 // Check for identity masks, which we can treat as free.
4700 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4701 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4702 all_of(enumerate(Mask), [](const auto &M) {
4703 return M.value() < 0 || M.value() == (int)M.index();
4704 }))
4705 return 0;
4706
4707 // Check for other shuffles that are not SK_ kinds but we have native
4708 // instructions for, for example ZIP and UZP.
4709 unsigned Unused;
4710 if (LT.second.isFixedLengthVector() &&
4711 LT.second.getVectorNumElements() == Mask.size() &&
4712 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4713 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4714 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4715 // Check for non-zero lane splats
4716 all_of(drop_begin(Mask),
4717 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4718 return 1;
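 // E.g. a v4i32 mask of <0, 4, 1, 5> matches isZIPMask (a single zip1), and
 // a non-zero-lane splat such as <1, 1, 1, 1> matches the final check, so
 // both are costed as one instruction.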
4719
4720 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4721 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4722 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4723 static const CostTblEntry ShuffleTbl[] = {
4724 // Broadcast shuffle kinds can be performed with 'dup'.
4725 {TTI::SK_Broadcast, MVT::v8i8, 1},
4726 {TTI::SK_Broadcast, MVT::v16i8, 1},
4727 {TTI::SK_Broadcast, MVT::v4i16, 1},
4728 {TTI::SK_Broadcast, MVT::v8i16, 1},
4729 {TTI::SK_Broadcast, MVT::v2i32, 1},
4730 {TTI::SK_Broadcast, MVT::v4i32, 1},
4731 {TTI::SK_Broadcast, MVT::v2i64, 1},
4732 {TTI::SK_Broadcast, MVT::v4f16, 1},
4733 {TTI::SK_Broadcast, MVT::v8f16, 1},
4734 {TTI::SK_Broadcast, MVT::v2f32, 1},
4735 {TTI::SK_Broadcast, MVT::v4f32, 1},
4736 {TTI::SK_Broadcast, MVT::v2f64, 1},
4737 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4738 // 'zip1/zip2' instructions.
4739 {TTI::SK_Transpose, MVT::v8i8, 1},
4740 {TTI::SK_Transpose, MVT::v16i8, 1},
4741 {TTI::SK_Transpose, MVT::v4i16, 1},
4742 {TTI::SK_Transpose, MVT::v8i16, 1},
4743 {TTI::SK_Transpose, MVT::v2i32, 1},
4744 {TTI::SK_Transpose, MVT::v4i32, 1},
4745 {TTI::SK_Transpose, MVT::v2i64, 1},
4746 {TTI::SK_Transpose, MVT::v4f16, 1},
4747 {TTI::SK_Transpose, MVT::v8f16, 1},
4748 {TTI::SK_Transpose, MVT::v2f32, 1},
4749 {TTI::SK_Transpose, MVT::v4f32, 1},
4750 {TTI::SK_Transpose, MVT::v2f64, 1},
4751 // Select shuffle kinds.
4752 // TODO: handle vXi8/vXi16.
4753 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4754 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4755 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4756 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4757 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4758 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4759 // PermuteSingleSrc shuffle kinds.
4760 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4761 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4762 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4763 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4764 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4765 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4766 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4767 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4768 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4769 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4770 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4771 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4772 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4773 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4774 // Reverse can be lowered with `rev`.
4775 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4776 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4777 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4778 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4779 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4780 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4781 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4782 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4783 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4784 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4785 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4786 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4787 // Splice can all be lowered as `ext`.
4788 {TTI::SK_Splice, MVT::v2i32, 1},
4789 {TTI::SK_Splice, MVT::v4i32, 1},
4790 {TTI::SK_Splice, MVT::v2i64, 1},
4791 {TTI::SK_Splice, MVT::v2f32, 1},
4792 {TTI::SK_Splice, MVT::v4f32, 1},
4793 {TTI::SK_Splice, MVT::v2f64, 1},
4794 {TTI::SK_Splice, MVT::v8f16, 1},
4795 {TTI::SK_Splice, MVT::v8bf16, 1},
4796 {TTI::SK_Splice, MVT::v8i16, 1},
4797 {TTI::SK_Splice, MVT::v16i8, 1},
4798 {TTI::SK_Splice, MVT::v4bf16, 1},
4799 {TTI::SK_Splice, MVT::v4f16, 1},
4800 {TTI::SK_Splice, MVT::v4i16, 1},
4801 {TTI::SK_Splice, MVT::v8i8, 1},
4802 // Broadcast shuffle kinds for scalable vectors
4803 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4804 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4805 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4806 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4807 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4808 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4809 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4810 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4811 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4812 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4813 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4814 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4815 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4816 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4817 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4818 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4819 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4820 // Handle the cases for vector.reverse with scalable vectors
4821 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4822 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4823 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4824 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4825 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4826 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4827 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4828 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4829 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4830 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4831 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4832 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4833 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4834 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4835 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4836 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4837 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4838 };
4839 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4840 return LT.first * Entry->Cost;
4841 }
4842
4843 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4844 return getSpliceCost(Tp, Index);
4845
4846 // Inserting a subvector can often be done with either a D, S or H register
4847 // move, so long as the inserted vector is "aligned".
4848 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4849 LT.second.getSizeInBits() <= 128 && SubTp) {
4850 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4851 if (SubLT.second.isVector()) {
4852 int NumElts = LT.second.getVectorNumElements();
4853 int NumSubElts = SubLT.second.getVectorNumElements();
4854 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4855 return SubLT.first;
4856 }
4857 }
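 // For example: inserting a v2f32 subvector into a v4f32 at index 0 or 2
 // passes the checks above and is costed as SubLT.first, i.e. a single
 // register move, rather than element-by-element inserts.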
4858
4859 // Restore optimal kind.
4860 if (IsExtractSubvector)
4861 Kind = TTI::SK_ExtractSubvector;
4862 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4863 CxtI);
4864}
4865
4866 static bool containsDecreasingPointers(Loop *TheLoop,
4867 PredicatedScalarEvolution *PSE) {
4868 const auto &Strides = DenseMap<Value *, const SCEV *>();
4869 for (BasicBlock *BB : TheLoop->blocks()) {
4870 // Scan the instructions in the block and look for addresses that are
4871 // consecutive and decreasing.
4872 for (Instruction &I : *BB) {
4873 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4874 Value *Ptr = getLoadStorePointerOperand(&I);
4875 Type *AccessTy = getLoadStoreType(&I);
4876 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4877 /*ShouldCheckWrap=*/false)
4878 .value_or(0) < 0)
4879 return true;
4880 }
4881 }
4882 }
4883 return false;
4884}
4885
4886 unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
4887 return ST->getEpilogueVectorizationMinVF();
4888}
4889
4890 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
4891 if (!ST->hasSVE())
4892 return false;
4893
4894 // We don't currently support vectorisation with interleaving for SVE - with
4895 // such loops we're better off not using tail-folding. This gives us a chance
4896 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4897 if (TFI->IAI->hasGroups())
4898 return false;
4899
4900 TailFoldingOpts Required = TailFoldingOpts::Disabled;
4901 if (TFI->LVL->getReductionVars().size())
4902 Required |= TailFoldingOpts::Reductions;
4903 if (TFI->LVL->getFixedOrderRecurrences().size())
4904 Required |= TailFoldingOpts::Recurrences;
4905
4906 // We call this to discover whether any load/store pointers in the loop have
4907 // negative strides. This will require extra work to reverse the loop
4908 // predicate, which may be expensive.
4909 if (containsDecreasingPointers(TFI->LVL->getLoop(),
4910 TFI->LVL->getPredicatedScalarEvolution()))
4911 Required |= TailFoldingOpts::Reverse;
4912 if (Required == TailFoldingOpts::Disabled)
4913 Required |= TailFoldingOpts::Simple;
4914
4915 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
4916 Required))
4917 return false;
4918
4919 // Don't tail-fold for tight loops where we would be better off interleaving
4920 // with an unpredicated loop.
4921 unsigned NumInsns = 0;
4922 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4923 NumInsns += BB->sizeWithoutDebug();
4924 }
4925
4926 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4927 return NumInsns >= SVETailFoldInsnThreshold;
4928}
4929
4930 InstructionCost
4931 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
4932 StackOffset BaseOffset, bool HasBaseReg,
4933 int64_t Scale, unsigned AddrSpace) const {
4934 // Scaling factors are not free at all.
4935 // Operands | Rt Latency
4936 // -------------------------------------------
4937 // Rt, [Xn, Xm] | 4
4938 // -------------------------------------------
4939 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4940 // Rt, [Xn, Wm, <extend> #imm] |
4941 TargetLoweringBase::AddrMode AM;
4942 AM.BaseGV = BaseGV;
4943 AM.BaseOffs = BaseOffset.getFixed();
4944 AM.HasBaseReg = HasBaseReg;
4945 AM.Scale = Scale;
4946 AM.ScalableOffset = BaseOffset.getScalable();
4947 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4948 // Scale represents reg2 * scale, thus account for 1 if
4949 // it is not equal to 0 or 1.
4950 return AM.Scale != 0 && AM.Scale != 1;
4951 return -1;
4952}
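// Sketch of the model above: ldr x0, [x1, x2, lsl #3] uses Scale == 8 and is
// reported as one extra unit, while [x1, x2] (Scale == 1) or a plain base
// register (Scale == 0) are treated as free, provided the addressing mode is
// legal for the access type; otherwise -1 is returned.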
4953
4954 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
4955 if (EnableOrLikeSelectOpt) {
4956 // For the binary operators (e.g. or) we need to be more careful than
4957 // selects, here we only transform them if they are already at a natural
4958 // break point in the code - the end of a block with an unconditional
4959 // terminator.
4960 if (I->getOpcode() == Instruction::Or &&
4961 isa<BranchInst>(I->getNextNode()) &&
4962 cast<BranchInst>(I->getNextNode())->isUnconditional())
4963 return true;
4964
4965 if (I->getOpcode() == Instruction::Add ||
4966 I->getOpcode() == Instruction::Sub)
4967 return true;
4968 }
4969 return BaseT::shouldTreatInstructionLikeSelect(I);
4970}
4971
4972 bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
4973 const TargetTransformInfo::LSRCost &C2) {
4974 // AArch64 specific here is adding the number of instructions to the
4975 // comparison (though not as the first consideration, as some targets do)
4976 // along with changing the priority of the base additions.
4977 // TODO: Maybe a more nuanced tradeoff between instruction count
4978 // and number of registers? To be investigated at a later date.
4979 if (EnableLSRCostOpt)
4980 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4981 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4982 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4983 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4984
4985 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
4986}
4987
4988static bool isSplatShuffle(Value *V) {
4989 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
4990 return all_equal(Shuf->getShuffleMask());
4991 return false;
4992}
4993
4994/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
4995/// or upper half of the vector elements.
4996static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
4997 bool AllowSplat = false) {
4998 // Scalable types can't be extract shuffle vectors.
4999 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5000 return false;
5001
5002 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5003 auto *FullTy = FullV->getType();
5004 auto *HalfTy = HalfV->getType();
5005 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5006 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5007 };
5008
5009 auto extractHalf = [](Value *FullV, Value *HalfV) {
5010 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5011 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5012 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5013 };
5014
5015 ArrayRef<int> M1, M2;
5016 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5017 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5018 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5019 return false;
5020
5021 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5022 // it is not checked as an extract below.
5023 if (AllowSplat && isSplatShuffle(Op1))
5024 S1Op1 = nullptr;
5025 if (AllowSplat && isSplatShuffle(Op2))
5026 S2Op1 = nullptr;
5027
5028 // Check that the operands are half as wide as the result and we extract
5029 // half of the elements of the input vectors.
5030 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5031 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5032 return false;
5033
5034 // Check the mask extracts either the lower or upper half of vector
5035 // elements.
5036 int M1Start = 0;
5037 int M2Start = 0;
5038 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5039 if ((S1Op1 &&
5040 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5041 (S2Op1 &&
5042 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5043 return false;
5044
5045 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5046 (M2Start != 0 && M2Start != (NumElements / 2)))
5047 return false;
5048 if (S1Op1 && S2Op1 && M1Start != M2Start)
5049 return false;
5050
5051 return true;
5052}
5053
5054/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5055/// of the vector elements.
5056static bool areExtractExts(Value *Ext1, Value *Ext2) {
5057 auto areExtDoubled = [](Instruction *Ext) {
5058 return Ext->getType()->getScalarSizeInBits() ==
5059 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5060 };
5061
5062 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5063 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5064 !areExtDoubled(cast<Instruction>(Ext1)) ||
5065 !areExtDoubled(cast<Instruction>(Ext2)))
5066 return false;
5067
5068 return true;
5069}
5070
5071/// Check if Op could be used with vmull_high_p64 intrinsic.
5072 static bool isOperandOfVmullHighP64(Value *Op) {
5073 Value *VectorOperand = nullptr;
5074 ConstantInt *ElementIndex = nullptr;
5075 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5076 m_ConstantInt(ElementIndex))) &&
5077 ElementIndex->getValue() == 1 &&
5078 isa<FixedVectorType>(VectorOperand->getType()) &&
5079 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5080}
5081
5082/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5083static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5084 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
5085}
5086
5087 static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
5088 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5089 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5090 if (!GEP || GEP->getNumOperands() != 2)
5091 return false;
5092
5093 Value *Base = GEP->getOperand(0);
5094 Value *Offsets = GEP->getOperand(1);
5095
5096 // We only care about scalar_base+vector_offsets.
5097 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5098 return false;
5099
5100 // Sink extends that would allow us to use 32-bit offset vectors.
5101 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5102 auto *OffsetsInst = cast<Instruction>(Offsets);
5103 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5104 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5105 Ops.push_back(&GEP->getOperandUse(1));
5106 }
5107
5108 // Sink the GEP.
5109 return true;
5110}
5111
5112/// We want to sink following cases:
5113/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5114/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
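/// An illustrative IR shape matched here (value names are placeholders):
///   %vs  = call i64 @llvm.vscale.i64()
///   %off = mul i64 %vs, 16
///   %p   = getelementptr i8, ptr %base, i64 %off
/// Sinking the vscale (and any zext feeding the mul/shl) next to the gep/add
/// helps isel fold the scaled-vscale offset into the addressing computation.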
5115 static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
5116 if (match(Op, m_VScale()))
5117 return true;
5118 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5119 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
5120 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5121 return true;
5122 }
5123 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5124 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
5125 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5126 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5127 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5128 return true;
5129 }
5130 return false;
5131}
5132
5133/// Check if sinking \p I's operands to I's basic block is profitable, because
5134/// the operands can be folded into a target instruction, e.g.
5135/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
5136 bool AArch64TTIImpl::isProfitableToSinkOperands(
5137 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5138 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5139 switch (II->getIntrinsicID()) {
5140 case Intrinsic::aarch64_neon_smull:
5141 case Intrinsic::aarch64_neon_umull:
5142 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
5143 /*AllowSplat=*/true)) {
5144 Ops.push_back(&II->getOperandUse(0));
5145 Ops.push_back(&II->getOperandUse(1));
5146 return true;
5147 }
5148 [[fallthrough]];
5149
5150 case Intrinsic::fma:
5151 case Intrinsic::fmuladd:
5152 if (isa<VectorType>(I->getType()) &&
5153 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5154 !ST->hasFullFP16())
5155 return false;
5156 [[fallthrough]];
5157 case Intrinsic::aarch64_neon_sqdmull:
5158 case Intrinsic::aarch64_neon_sqdmulh:
5159 case Intrinsic::aarch64_neon_sqrdmulh:
5160 // Sink splats for index lane variants
5161 if (isSplatShuffle(II->getOperand(0)))
5162 Ops.push_back(&II->getOperandUse(0));
5163 if (isSplatShuffle(II->getOperand(1)))
5164 Ops.push_back(&II->getOperandUse(1));
5165 return !Ops.empty();
5166 case Intrinsic::aarch64_neon_fmlal:
5167 case Intrinsic::aarch64_neon_fmlal2:
5168 case Intrinsic::aarch64_neon_fmlsl:
5169 case Intrinsic::aarch64_neon_fmlsl2:
5170 // Sink splats for index lane variants
5171 if (isSplatShuffle(II->getOperand(1)))
5172 Ops.push_back(&II->getOperandUse(1));
5173 if (isSplatShuffle(II->getOperand(2)))
5174 Ops.push_back(&II->getOperandUse(2));
5175 return !Ops.empty();
5176 case Intrinsic::aarch64_sve_ptest_first:
5177 case Intrinsic::aarch64_sve_ptest_last:
5178 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
5179 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
5180 Ops.push_back(&II->getOperandUse(0));
5181 return !Ops.empty();
5182 case Intrinsic::aarch64_sme_write_horiz:
5183 case Intrinsic::aarch64_sme_write_vert:
5184 case Intrinsic::aarch64_sme_writeq_horiz:
5185 case Intrinsic::aarch64_sme_writeq_vert: {
5186 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
5187 if (!Idx || Idx->getOpcode() != Instruction::Add)
5188 return false;
5189 Ops.push_back(&II->getOperandUse(1));
5190 return true;
5191 }
5192 case Intrinsic::aarch64_sme_read_horiz:
5193 case Intrinsic::aarch64_sme_read_vert:
5194 case Intrinsic::aarch64_sme_readq_horiz:
5195 case Intrinsic::aarch64_sme_readq_vert:
5196 case Intrinsic::aarch64_sme_ld1b_vert:
5197 case Intrinsic::aarch64_sme_ld1h_vert:
5198 case Intrinsic::aarch64_sme_ld1w_vert:
5199 case Intrinsic::aarch64_sme_ld1d_vert:
5200 case Intrinsic::aarch64_sme_ld1q_vert:
5201 case Intrinsic::aarch64_sme_st1b_vert:
5202 case Intrinsic::aarch64_sme_st1h_vert:
5203 case Intrinsic::aarch64_sme_st1w_vert:
5204 case Intrinsic::aarch64_sme_st1d_vert:
5205 case Intrinsic::aarch64_sme_st1q_vert:
5206 case Intrinsic::aarch64_sme_ld1b_horiz:
5207 case Intrinsic::aarch64_sme_ld1h_horiz:
5208 case Intrinsic::aarch64_sme_ld1w_horiz:
5209 case Intrinsic::aarch64_sme_ld1d_horiz:
5210 case Intrinsic::aarch64_sme_ld1q_horiz:
5211 case Intrinsic::aarch64_sme_st1b_horiz:
5212 case Intrinsic::aarch64_sme_st1h_horiz:
5213 case Intrinsic::aarch64_sme_st1w_horiz:
5214 case Intrinsic::aarch64_sme_st1d_horiz:
5215 case Intrinsic::aarch64_sme_st1q_horiz: {
5216 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
5217 if (!Idx || Idx->getOpcode() != Instruction::Add)
5218 return false;
5219 Ops.push_back(&II->getOperandUse(3));
5220 return true;
5221 }
5222 case Intrinsic::aarch64_neon_pmull:
5223 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
5224 return false;
5225 Ops.push_back(&II->getOperandUse(0));
5226 Ops.push_back(&II->getOperandUse(1));
5227 return true;
5228 case Intrinsic::aarch64_neon_pmull64:
5229 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
5230 II->getArgOperand(1)))
5231 return false;
5232 Ops.push_back(&II->getArgOperandUse(0));
5233 Ops.push_back(&II->getArgOperandUse(1));
5234 return true;
5235 case Intrinsic::masked_gather:
5236 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
5237 return false;
5238 Ops.push_back(&II->getArgOperandUse(0));
5239 return true;
5240 case Intrinsic::masked_scatter:
5241 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
5242 return false;
5243 Ops.push_back(&II->getArgOperandUse(1));
5244 return true;
5245 default:
5246 return false;
5247 }
5248 }
5249
5250 // Sink vscales closer to uses for better isel
5251 switch (I->getOpcode()) {
5252 case Instruction::GetElementPtr:
5253 case Instruction::Add:
5254 case Instruction::Sub:
5255 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
5256 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
5257 Ops.push_back(&I->getOperandUse(Op));
5258 return true;
5259 }
5260 }
5261 break;
5262 default:
5263 break;
5264 }
5265
5266 if (!I->getType()->isVectorTy())
5267 return false;
5268
5269 switch (I->getOpcode()) {
5270 case Instruction::Sub:
5271 case Instruction::Add: {
5272 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
5273 return false;
5274
5275 // If the exts' operands extract either the lower or upper elements, we
5276 // can sink them too.
5277 auto Ext1 = cast<Instruction>(I->getOperand(0));
5278 auto Ext2 = cast<Instruction>(I->getOperand(1));
5279 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
5280 Ops.push_back(&Ext1->getOperandUse(0));
5281 Ops.push_back(&Ext2->getOperandUse(0));
5282 }
5283
5284 Ops.push_back(&I->getOperandUse(0));
5285 Ops.push_back(&I->getOperandUse(1));
5286
5287 return true;
5288 }
5289 case Instruction::Or: {
5290 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
5291 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
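 // For instance (sketch, operand names illustrative):
 //   %notm = xor <16 x i8> %m, <all ones>
 //   %a    = and <16 x i8> %m, %x
 //   %b    = and <16 x i8> %notm, %y
 //   %r    = or  <16 x i8> %a, %b
 // Sinking both ands next to the or lets the backend select a single BSL.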
5292 if (ST->hasNEON()) {
5293 Instruction *OtherAnd, *IA, *IB;
5294 Value *MaskValue;
5295 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
5296 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
5297 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
5298 m_Instruction(IA)))))) {
5299 if (match(OtherAnd,
5300 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
5301 Instruction *MainAnd = I->getOperand(0) == OtherAnd
5302 ? cast<Instruction>(I->getOperand(1))
5303 : cast<Instruction>(I->getOperand(0));
5304
5305 // Both Ands should be in same basic block as Or
5306 if (I->getParent() != MainAnd->getParent() ||
5307 I->getParent() != OtherAnd->getParent())
5308 return false;
5309
5310 // Non-mask operands of both Ands should also be in same basic block
5311 if (I->getParent() != IA->getParent() ||
5312 I->getParent() != IB->getParent())
5313 return false;
5314
5315 Ops.push_back(
5316 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
5317 Ops.push_back(&I->getOperandUse(0));
5318 Ops.push_back(&I->getOperandUse(1));
5319
5320 return true;
5321 }
5322 }
5323 }
5324
5325 return false;
5326 }
5327 case Instruction::Mul: {
5328 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
5329 auto *Ty = cast<VectorType>(V->getType());
5330 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5331 if (Ty->isScalableTy())
5332 return false;
5333
5334 // Indexed variants of Mul exist for i16 and i32 element types only.
5335 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
5336 };
5337
5338 int NumZExts = 0, NumSExts = 0;
5339 for (auto &Op : I->operands()) {
5340 // Make sure we are not already sinking this operand
5341 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
5342 continue;
5343
5344 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
5345 auto *Ext = cast<Instruction>(Op);
5346 auto *ExtOp = Ext->getOperand(0);
5347 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
5348 Ops.push_back(&Ext->getOperandUse(0));
5349 Ops.push_back(&Op);
5350
5351 if (isa<SExtInst>(Ext))
5352 NumSExts++;
5353 else
5354 NumZExts++;
5355
5356 continue;
5357 }
5358
5359 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
5360 if (!Shuffle)
5361 continue;
5362
5363 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
5364 // operand and the s/zext can help create indexed s/umull. This is
5365 // especially useful to prevent i64 mul being scalarized.
5366 if (isSplatShuffle(Shuffle) &&
5367 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
5368 Ops.push_back(&Shuffle->getOperandUse(0));
5369 Ops.push_back(&Op);
5370 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
5371 NumSExts++;
5372 else
5373 NumZExts++;
5374 continue;
5375 }
5376
5377 Value *ShuffleOperand = Shuffle->getOperand(0);
5378 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
5379 if (!Insert)
5380 continue;
5381
5382 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
5383 if (!OperandInstr)
5384 continue;
5385
5386 ConstantInt *ElementConstant =
5387 dyn_cast<ConstantInt>(Insert->getOperand(2));
5388 // Check that the insertelement is inserting into element 0
5389 if (!ElementConstant || !ElementConstant->isZero())
5390 continue;
5391
5392 unsigned Opcode = OperandInstr->getOpcode();
5393 if (Opcode == Instruction::SExt)
5394 NumSExts++;
5395 else if (Opcode == Instruction::ZExt)
5396 NumZExts++;
5397 else {
5398 // If we find that the top bits are known 0, then we can sink and allow
5399 // the backend to generate a umull.
5400 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
5401 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
5402 const DataLayout &DL = I->getDataLayout();
5403 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
5404 continue;
5405 NumZExts++;
5406 }
5407
5408 Ops.push_back(&Insert->getOperandUse(1));
5409 Ops.push_back(&Shuffle->getOperandUse(0));
5410 Ops.push_back(&Op);
5411 }
5412
5413 // It is profitable to sink if we found two of the same type of extends.
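 // E.g. mul(zext <8 x i8> %a to <8 x i16>, zext <8 x i8> %b to <8 x i16>)
 // with both zexts sunk next to it can be selected as a single umull, rather
 // than widening each operand first and multiplying at full width.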
5414 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
5415 return true;
5416
5417 // Otherwise, see if we should sink splats for indexed variants.
5418 if (!ShouldSinkSplatForIndexedVariant(I))
5419 return false;
5420
5421 Ops.clear();
5422 if (isSplatShuffle(I->getOperand(0)))
5423 Ops.push_back(&I->getOperandUse(0));
5424 if (isSplatShuffle(I->getOperand(1)))
5425 Ops.push_back(&I->getOperandUse(1));
5426
5427 return !Ops.empty();
5428 }
5429 case Instruction::FMul: {
5430 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5431 if (I->getType()->isScalableTy())
5432 return false;
5433
5434 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5435 !ST->hasFullFP16())
5436 return false;
5437
5438 // Sink splats for index lane variants
5439 if (isSplatShuffle(I->getOperand(0)))
5440 Ops.push_back(&I->getOperandUse(0));
5441 if (isSplatShuffle(I->getOperand(1)))
5442 Ops.push_back(&I->getOperandUse(1));
5443 return !Ops.empty();
5444 }
5445 default:
5446 return false;
5447 }
5448 return false;
5449}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, int PredPos)
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II, bool hasInactiveVector)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getEpilogueVectorizationMinVF() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
unsigned getEpilogueVectorizationMinVF() const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
unsigned countLeadingOnes() const
Definition: APInt.h:1603
void negate()
Negate this APInt in place.
Definition: APInt.h:1450
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:596
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:280
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:668
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:896
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:780
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:932
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:353
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:218
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
unsigned arg_size() const
Definition: InstrTypes.h:1292
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isIntPredicate() const
Definition: InstrTypes.h:781
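Illustrative sketch of checking one of the predicates above; Cmp is an assumed llvm::CmpInst*.
llvm::CmpInst::Predicate Pred = Cmp->getPredicate();
if (!Cmp->isIntPredicate() && Pred == llvm::CmpInst::FCMP_OLT) {
  // Ordered floating-point "less than" comparison.
}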
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1672
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:208
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
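Hedged sketch of the two ElementCount factories above, e.g. fixed-width NEON versus scalable SVE element counts.
llvm::ElementCount FixedEC = llvm::ElementCount::getFixed(4);    // exactly 4 elements
llvm::ElementCount ScalEC  = llvm::ElementCount::getScalable(4); // vscale x 4 elements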
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:89
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2503
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2554
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1060
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2491
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:556
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Definition: IRBuilder.cpp:1152
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:546
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1048
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:541
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:528
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1889
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2236
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2155
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1813
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2525
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:566
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:551
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1689
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2227
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
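A hedged sketch (not from this file) of typical IRBuilder usage with some of the Create* helpers above; BB, Ptr and Mask are assumed to already exist.
llvm::IRBuilder<> Builder(BB);                         // append to the end of BB
llvm::Value *Splat =
    Builder.CreateVectorSplat(4, Builder.getInt64(1)); // <4 x i64> of ones
llvm::CallInst *Load = Builder.CreateMaskedLoad(
    Splat->getType(), Ptr, llvm::Align(16), Mask);     // masked load of <4 x i64>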
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:48
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:394
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:418
BuilderTy & Builder
Definition: InstCombiner.h:61
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
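Small sketch of the invalid-cost idiom above.
llvm::InstructionCost Cost = llvm::InstructionCost::getInvalid();
if (!Cost.isValid()) {
  // The operation could not be costed.
}
std::optional<llvm::InstructionCost::CostType> Raw = Cost.getValue(); // std::nullopt here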
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:686
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:77
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:651
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
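A small sketch of the ScalarEvolution queries above, assuming an existing ScalarEvolution &SE, Loop *L and Value *V.
const llvm::SCEV *S = SE.getSCEV(V);                   // SCEV expression for V
if (SE.isLoopInvariant(S, L)) {
  const llvm::SCEV *BTC = SE.getBackedgeTakenCount(L); // may be SCEVCouldNotCompute
  (void)BTC;
}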
This instruction constructs a fixed permutation of two input vectors.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
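Minimal sketch of the SmallVector operations listed above.
llvm::SmallVector<unsigned, 8> Indices;  // inline storage for 8 elements
Indices.push_back(0);
Indices.resize(4);                       // now holds 4 elements
bool IsEmpty = Indices.empty();          // false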
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
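Sketch combining the two StackOffset factories above (assuming StackOffset addition, as used for SVE frame offsets); the byte values are arbitrary.
llvm::StackOffset Off =
    llvm::StackOffset::getFixed(16) + llvm::StackOffset::getScalable(32);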
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
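Minimal sketch of the StringSwitch idiom above; the strings and values are placeholders, not options from this file, and Name is an assumed llvm::StringRef.
unsigned Bits = llvm::StringSwitch<unsigned>(Name)
                    .Case("simple", 1)
                    .Case("all", 2)
                    .Default(0);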
Class to represent struct types.
Definition: DerivedTypes.h:218
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
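Sketch contrasting the fixed and scalable TypeSize factories above, e.g. a 128-bit NEON register versus one 128-bit SVE block.
llvm::TypeSize FixedTS = llvm::TypeSize::getFixed(128);
llvm::TypeSize ScalTS  = llvm::TypeSize::getScalable(128);
bool IsScal  = ScalTS.isScalable();       // true
uint64_t Min = ScalTS.getKnownMinValue(); // 128; the real size is 128 * vscale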
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
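Hedged sketch of the Type queries above for some assumed vector type VecTy.
llvm::Type *Elt = VecTy->getScalarType();     // element type of the vector
if (Elt->isFloatingPointTy()) {
  unsigned Bits = Elt->getScalarSizeInBits(); // e.g. 16, 32 or 64
  (void)Bits;
}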
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
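Illustrative sketch of vector type construction with the factories above, assuming an LLVMContext Ctx.
llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
auto *V4I32   = llvm::FixedVectorType::get(I32, 4);    // <4 x i32>
auto *NxV4I32 = llvm::ScalableVectorType::get(I32, 4); // <vscale x 4 x i32>
auto *Generic =
    llvm::VectorType::get(I32, llvm::ElementCount::getScalable(2));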
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match a specific value; the pattern succeeds only if the operand is exactly V.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Match any compare instruction and ignore it.
Definition: PatternMatch.h:105
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:931
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
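A hedged sketch of the PatternMatch combinators listed above; V is some assumed llvm::Value* being inspected.
using namespace llvm::PatternMatch;
llvm::Value *X = nullptr, *Y = nullptr;
if (match(V, m_c_And(m_ZExt(m_Value(X)), m_Value(Y)))) {
  // V is (and (zext X), Y), with the operands in either order.
}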
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
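Small sketch of the range helpers above (all_of and enumerate) over a SmallVector of integers.
llvm::SmallVector<int, 4> Vals = {1, 2, 4, 8};
bool AllPos = llvm::all_of(Vals, [](int V) { return V > 0; }); // true
for (auto [Idx, Val] : llvm::enumerate(Vals)) {
  // Idx runs 0..3, Val is the corresponding element.
  (void)Idx; (void)Val;
}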
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1065
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition: VE.h:376
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
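Minimal sketch of the power-of-two helpers above; 32 is an arbitrary example value.
bool Pow2 = llvm::isPowerOf2_64(32); // true
unsigned Lg = llvm::Log2_32(32);     // 5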
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
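Hedged sketch of a known-bits query, assuming Value *V has type i64 and DL is the module's DataLayout.
llvm::KnownBits Known(64);     // bit width must match V's type
llvm::computeKnownBits(V, Known, DL);
bool Bit0Zero = Known.Zero[0]; // true if bit 0 is known to be zero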
DWARFExpression::Operation Op
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
@ Default
The result values are uniform if and only if all operands are uniform.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:382
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
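Sketch of building vector EVTs with getVectorVT above, assuming an LLVMContext Ctx.
llvm::EVT F32(llvm::MVT::f32);
llvm::EVT V4F32   = llvm::EVT::getVectorVT(Ctx, F32, 4);       // fixed <4 x f32>
llvm::EVT NxV4F32 = llvm::EVT::getVectorVT(Ctx, F32, 4, true); // scalable
bool Scal = NxV4F32.isScalableVector();                        // true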
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55