LLVM 20.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
28#include <algorithm>
29#include <optional>
30using namespace llvm;
31using namespace llvm::PatternMatch;
32
33#define DEBUG_TYPE "aarch64tti"
34
35static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
36 cl::init(true), cl::Hidden);
37
39 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
40
41static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
43
44static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
45 cl::init(10), cl::Hidden);
46
47static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
48 cl::init(15), cl::Hidden);
49
51 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
53
55 "call-penalty-sm-change", cl::init(5), cl::Hidden,
57 "Penalty of calling a function that requires a change to PSTATE.SM"));
58
60 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
61 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
62
63static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
64 cl::init(true), cl::Hidden);
65
66static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
67 cl::init(true), cl::Hidden);
68
69// A complete guess as to a reasonable cost.
71 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
72 cl::desc("The cost of a histcnt instruction"));
73
75 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
76 cl::desc("The number of instructions to search for a redundant dmb"));
77
78namespace {
79class TailFoldingOption {
80 // These bitfields will only ever be set to something non-zero in operator=,
81 // when setting the -sve-tail-folding option. This option should always be of
82 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
83 // InitialBits is one of (disabled|all|simple). EnableBits represents
84 // additional flags we're enabling, and DisableBits for those flags we're
85 // disabling. The default flag is tracked in the variable NeedsDefault, since
86 // at the time of setting the option we may not know what the default value
87 // for the CPU is.
88 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
89 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
91
92 // This value needs to be initialised to true in case the user does not
93 // explicitly set the -sve-tail-folding option.
94 bool NeedsDefault = true;
95
96 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
97
98 void setNeedsDefault(bool V) { NeedsDefault = V; }
99
100 void setEnableBit(TailFoldingOpts Bit) {
101 EnableBits |= Bit;
102 DisableBits &= ~Bit;
103 }
104
105 void setDisableBit(TailFoldingOpts Bit) {
106 EnableBits &= ~Bit;
107 DisableBits |= Bit;
108 }
109
110 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
111 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
112
113 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
114 "Initial bits should only include one of "
115 "(disabled|all|simple|default)");
116 Bits = NeedsDefault ? DefaultBits : InitialBits;
117 Bits |= EnableBits;
118 Bits &= ~DisableBits;
119
120 return Bits;
121 }
122
123 void reportError(std::string Opt) {
124 errs() << "invalid argument '" << Opt
125 << "' to -sve-tail-folding=; the option should be of the form\n"
126 " (disabled|all|default|simple)[+(reductions|recurrences"
127 "|reverse|noreductions|norecurrences|noreverse)]\n";
128 report_fatal_error("Unrecognised tail-folding option");
129 }
130
131public:
132
133 void operator=(const std::string &Val) {
134 // If the user explicitly sets -sve-tail-folding= then treat as an error.
135 if (Val.empty()) {
136 reportError("");
137 return;
138 }
139
140 // Since the user is explicitly setting the option we don't automatically
141 // need the default unless they require it.
142 setNeedsDefault(false);
143
144 SmallVector<StringRef, 4> TailFoldTypes;
145 StringRef(Val).split(TailFoldTypes, '+', -1, false);
146
147 unsigned StartIdx = 1;
148 if (TailFoldTypes[0] == "disabled")
149 setInitialBits(TailFoldingOpts::Disabled);
150 else if (TailFoldTypes[0] == "all")
151 setInitialBits(TailFoldingOpts::All);
152 else if (TailFoldTypes[0] == "default")
153 setNeedsDefault(true);
154 else if (TailFoldTypes[0] == "simple")
155 setInitialBits(TailFoldingOpts::Simple);
156 else {
157 StartIdx = 0;
158 setInitialBits(TailFoldingOpts::Disabled);
159 }
160
161 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
162 if (TailFoldTypes[I] == "reductions")
163 setEnableBit(TailFoldingOpts::Reductions);
164 else if (TailFoldTypes[I] == "recurrences")
165 setEnableBit(TailFoldingOpts::Recurrences);
166 else if (TailFoldTypes[I] == "reverse")
167 setEnableBit(TailFoldingOpts::Reverse);
168 else if (TailFoldTypes[I] == "noreductions")
169 setDisableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "norecurrences")
171 setDisableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "noreverse")
173 setDisableBit(TailFoldingOpts::Reverse);
174 else
175 reportError(Val);
176 }
177 }
178
179 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
180 return (getBits(DefaultBits) & Required) == Required;
181 }
182};
183} // namespace
184
185TailFoldingOption TailFoldingOptionLoc;
186
188 "sve-tail-folding",
189 cl::desc(
190 "Control the use of vectorisation using tail-folding for SVE where the"
191 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
192 "\ndisabled (Initial) No loop types will vectorize using "
193 "tail-folding"
194 "\ndefault (Initial) Uses the default tail-folding settings for "
195 "the target CPU"
196 "\nall (Initial) All legal loop types will vectorize using "
197 "tail-folding"
198 "\nsimple (Initial) Use tail-folding for simple loops (not "
199 "reductions or recurrences)"
200 "\nreductions Use tail-folding for loops containing reductions"
201 "\nnoreductions Inverse of above"
202 "\nrecurrences Use tail-folding for loops containing fixed order "
203 "recurrences"
204 "\nnorecurrences Inverse of above"
205 "\nreverse Use tail-folding for loops requiring reversed "
206 "predicates"
207 "\nnoreverse Inverse of above"),
209
210// Experimental option that will only be fully functional when the
211// code-generator is changed to use SVE instead of NEON for all fixed-width
212// operations.
214 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
215
216// Experimental option that will only be fully functional when the cost-model
217// and code-generator have been changed to avoid using scalable vector
218// instructions that are not legal in streaming SVE mode.
220 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222static bool isSMEABIRoutineCall(const CallInst &CI) {
223 const auto *F = CI.getCalledFunction();
224 return F && StringSwitch<bool>(F->getName())
225 .Case("__arm_sme_state", true)
226 .Case("__arm_tpidr2_save", true)
227 .Case("__arm_tpidr2_restore", true)
228 .Case("__arm_za_disable", true)
229 .Default(false);
230}
231
232/// Returns true if the function has explicit operations that can only be
233/// lowered using incompatible instructions for the selected mode. This also
234/// returns true if the function F may use or modify ZA state.
236 for (const BasicBlock &BB : *F) {
237 for (const Instruction &I : BB) {
238 // Be conservative for now and assume that any call to inline asm or to
239 // intrinsics could could result in non-streaming ops (e.g. calls to
240 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
241 // all native LLVM instructions can be lowered to compatible instructions.
242 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
243 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
244 isSMEABIRoutineCall(cast<CallInst>(I))))
245 return true;
246 }
247 }
248 return false;
249}
250
252 const Function *Callee) const {
253 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
254
255 // When inlining, we should consider the body of the function, not the
256 // interface.
257 if (CalleeAttrs.hasStreamingBody()) {
258 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
259 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
260 }
261
262 if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0())
263 return false;
264
265 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
266 CallerAttrs.requiresSMChange(CalleeAttrs) ||
267 CallerAttrs.requiresPreservingZT0(CalleeAttrs) ||
268 CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) {
269 if (hasPossibleIncompatibleOps(Callee))
270 return false;
271 }
272
273 return BaseT::areInlineCompatible(Caller, Callee);
274}
275
277 const Function *Caller, const Function *Callee,
278 const ArrayRef<Type *> &Types) const {
279 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
280 return false;
281
282 // We need to ensure that argument promotion does not attempt to promote
283 // pointers to fixed-length vector types larger than 128 bits like
284 // <8 x float> (and pointers to aggregate types which have such fixed-length
285 // vector type members) into the values of the pointees. Such vector types
286 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
287 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
288 // types can be safely treated as 128-bit NEON types and they cannot be
289 // distinguished in IR.
290 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
291 auto FVTy = dyn_cast<FixedVectorType>(Ty);
292 return FVTy &&
293 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
294 }))
295 return false;
296
297 return true;
298}
299
300unsigned
302 unsigned DefaultCallPenalty) const {
303 // This function calculates a penalty for executing Call in F.
304 //
305 // There are two ways this function can be called:
306 // (1) F:
307 // call from F -> G (the call here is Call)
308 //
309 // For (1), Call.getCaller() == F, so it will always return a high cost if
310 // a streaming-mode change is required (thus promoting the need to inline the
311 // function)
312 //
313 // (2) F:
314 // call from F -> G (the call here is not Call)
315 // G:
316 // call from G -> H (the call here is Call)
317 //
318 // For (2), if after inlining the body of G into F the call to H requires a
319 // streaming-mode change, and the call to G from F would also require a
320 // streaming-mode change, then there is benefit to do the streaming-mode
321 // change only once and avoid inlining of G into F.
322 SMEAttrs FAttrs(*F);
323 SMEAttrs CalleeAttrs(Call);
324 if (FAttrs.requiresSMChange(CalleeAttrs)) {
325 if (F == Call.getCaller()) // (1)
326 return CallPenaltyChangeSM * DefaultCallPenalty;
327 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
328 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
329 }
330
331 return DefaultCallPenalty;
332}
333
338 ST->isNeonAvailable());
339}
340
341/// Calculate the cost of materializing a 64-bit value. This helper
342/// method might only calculate a fraction of a larger immediate. Therefore it
343/// is valid to return a cost of ZERO.
345 // Check if the immediate can be encoded within an instruction.
346 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
347 return 0;
348
349 if (Val < 0)
350 Val = ~Val;
351
352 // Calculate how many moves we will need to materialize this constant.
355 return Insn.size();
356}
357
358/// Calculate the cost of materializing the given constant.
361 assert(Ty->isIntegerTy());
362
363 unsigned BitSize = Ty->getPrimitiveSizeInBits();
364 if (BitSize == 0)
365 return ~0U;
366
367 // Sign-extend all constants to a multiple of 64-bit.
368 APInt ImmVal = Imm;
369 if (BitSize & 0x3f)
370 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
371
372 // Split the constant into 64-bit chunks and calculate the cost for each
373 // chunk.
375 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
376 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
377 int64_t Val = Tmp.getSExtValue();
378 Cost += getIntImmCost(Val);
379 }
380 // We need at least one instruction to materialze the constant.
381 return std::max<InstructionCost>(1, Cost);
382}
383
385 const APInt &Imm, Type *Ty,
387 Instruction *Inst) {
388 assert(Ty->isIntegerTy());
389
390 unsigned BitSize = Ty->getPrimitiveSizeInBits();
391 // There is no cost model for constants with a bit size of 0. Return TCC_Free
392 // here, so that constant hoisting will ignore this constant.
393 if (BitSize == 0)
394 return TTI::TCC_Free;
395
396 unsigned ImmIdx = ~0U;
397 switch (Opcode) {
398 default:
399 return TTI::TCC_Free;
400 case Instruction::GetElementPtr:
401 // Always hoist the base address of a GetElementPtr.
402 if (Idx == 0)
403 return 2 * TTI::TCC_Basic;
404 return TTI::TCC_Free;
405 case Instruction::Store:
406 ImmIdx = 0;
407 break;
408 case Instruction::Add:
409 case Instruction::Sub:
410 case Instruction::Mul:
411 case Instruction::UDiv:
412 case Instruction::SDiv:
413 case Instruction::URem:
414 case Instruction::SRem:
415 case Instruction::And:
416 case Instruction::Or:
417 case Instruction::Xor:
418 case Instruction::ICmp:
419 ImmIdx = 1;
420 break;
421 // Always return TCC_Free for the shift value of a shift instruction.
422 case Instruction::Shl:
423 case Instruction::LShr:
424 case Instruction::AShr:
425 if (Idx == 1)
426 return TTI::TCC_Free;
427 break;
428 case Instruction::Trunc:
429 case Instruction::ZExt:
430 case Instruction::SExt:
431 case Instruction::IntToPtr:
432 case Instruction::PtrToInt:
433 case Instruction::BitCast:
434 case Instruction::PHI:
435 case Instruction::Call:
436 case Instruction::Select:
437 case Instruction::Ret:
438 case Instruction::Load:
439 break;
440 }
441
442 if (Idx == ImmIdx) {
443 int NumConstants = (BitSize + 63) / 64;
445 return (Cost <= NumConstants * TTI::TCC_Basic)
446 ? static_cast<int>(TTI::TCC_Free)
447 : Cost;
448 }
450}
451
454 const APInt &Imm, Type *Ty,
456 assert(Ty->isIntegerTy());
457
458 unsigned BitSize = Ty->getPrimitiveSizeInBits();
459 // There is no cost model for constants with a bit size of 0. Return TCC_Free
460 // here, so that constant hoisting will ignore this constant.
461 if (BitSize == 0)
462 return TTI::TCC_Free;
463
464 // Most (all?) AArch64 intrinsics do not support folding immediates into the
465 // selected instruction, so we compute the materialization cost for the
466 // immediate directly.
467 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
469
470 switch (IID) {
471 default:
472 return TTI::TCC_Free;
473 case Intrinsic::sadd_with_overflow:
474 case Intrinsic::uadd_with_overflow:
475 case Intrinsic::ssub_with_overflow:
476 case Intrinsic::usub_with_overflow:
477 case Intrinsic::smul_with_overflow:
478 case Intrinsic::umul_with_overflow:
479 if (Idx == 1) {
480 int NumConstants = (BitSize + 63) / 64;
482 return (Cost <= NumConstants * TTI::TCC_Basic)
483 ? static_cast<int>(TTI::TCC_Free)
484 : Cost;
485 }
486 break;
487 case Intrinsic::experimental_stackmap:
488 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
489 return TTI::TCC_Free;
490 break;
491 case Intrinsic::experimental_patchpoint_void:
492 case Intrinsic::experimental_patchpoint:
493 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
494 return TTI::TCC_Free;
495 break;
496 case Intrinsic::experimental_gc_statepoint:
497 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
498 return TTI::TCC_Free;
499 break;
500 }
502}
503
506 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
507 if (TyWidth == 32 || TyWidth == 64)
509 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
510 return TTI::PSK_Software;
511}
512
513static bool isUnpackedVectorVT(EVT VecVT) {
514 return VecVT.isScalableVector() &&
516}
517
519 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
520 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
521 unsigned TotalHistCnts = 1;
522
523 unsigned EltSize = EltTy->getScalarSizeInBits();
524 // Only allow (up to 64b) integers or pointers
525 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
527
528 // FIXME: We should be able to generate histcnt for fixed-length vectors
529 // using ptrue with a specific VL.
530 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
531 unsigned EC = VTy->getElementCount().getKnownMinValue();
532 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
534
535 // HistCnt only supports 32b and 64b element types
536 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
537
538 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
540
541 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
542 TotalHistCnts = EC / NaturalVectorWidth;
543 }
544
545 return InstructionCost(BaseHistCntCost * TotalHistCnts);
546}
547
551 // The code-generator is currently not able to handle scalable vectors
552 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
553 // it. This change will be removed when code-generation for these types is
554 // sufficiently reliable.
555 auto *RetTy = ICA.getReturnType();
556 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
557 if (VTy->getElementCount() == ElementCount::getScalable(1))
559
560 switch (ICA.getID()) {
561 case Intrinsic::experimental_vector_histogram_add:
562 if (!ST->hasSVE2())
564 return getHistogramCost(ICA);
565 case Intrinsic::umin:
566 case Intrinsic::umax:
567 case Intrinsic::smin:
568 case Intrinsic::smax: {
569 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
570 MVT::v8i16, MVT::v2i32, MVT::v4i32,
571 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
572 MVT::nxv2i64};
574 // v2i64 types get converted to cmp+bif hence the cost of 2
575 if (LT.second == MVT::v2i64)
576 return LT.first * 2;
577 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
578 return LT.first;
579 break;
580 }
581 case Intrinsic::sadd_sat:
582 case Intrinsic::ssub_sat:
583 case Intrinsic::uadd_sat:
584 case Intrinsic::usub_sat: {
585 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
586 MVT::v8i16, MVT::v2i32, MVT::v4i32,
587 MVT::v2i64};
589 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
590 // need to extend the type, as it uses shr(qadd(shl, shl)).
591 unsigned Instrs =
592 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
593 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
594 return LT.first * Instrs;
595 break;
596 }
597 case Intrinsic::abs: {
598 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
599 MVT::v8i16, MVT::v2i32, MVT::v4i32,
600 MVT::v2i64};
602 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
603 return LT.first;
604 break;
605 }
606 case Intrinsic::bswap: {
607 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
608 MVT::v4i32, MVT::v2i64};
610 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
611 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
612 return LT.first;
613 break;
614 }
615 case Intrinsic::stepvector: {
616 InstructionCost Cost = 1; // Cost of the `index' instruction
618 // Legalisation of illegal vectors involves an `index' instruction plus
619 // (LT.first - 1) vector adds.
620 if (LT.first > 1) {
621 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
622 InstructionCost AddCost =
623 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
624 Cost += AddCost * (LT.first - 1);
625 }
626 return Cost;
627 }
628 case Intrinsic::vector_extract:
629 case Intrinsic::vector_insert: {
630 // If both the vector and subvector types are legal types and the index
631 // is 0, then this should be a no-op or simple operation; return a
632 // relatively low cost.
633
634 // If arguments aren't actually supplied, then we cannot determine the
635 // value of the index. We also want to skip predicate types.
636 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
638 break;
639
640 LLVMContext &C = RetTy->getContext();
641 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
642 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
643 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
644 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
645 // Skip this if either the vector or subvector types are unpacked
646 // SVE types; they may get lowered to stack stores and loads.
647 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
648 break;
649
651 getTLI()->getTypeConversion(C, SubVecVT);
653 getTLI()->getTypeConversion(C, VecVT);
654 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
655 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
656 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
657 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
658 return TTI::TCC_Free;
659 break;
660 }
661 case Intrinsic::bitreverse: {
662 static const CostTblEntry BitreverseTbl[] = {
663 {Intrinsic::bitreverse, MVT::i32, 1},
664 {Intrinsic::bitreverse, MVT::i64, 1},
665 {Intrinsic::bitreverse, MVT::v8i8, 1},
666 {Intrinsic::bitreverse, MVT::v16i8, 1},
667 {Intrinsic::bitreverse, MVT::v4i16, 2},
668 {Intrinsic::bitreverse, MVT::v8i16, 2},
669 {Intrinsic::bitreverse, MVT::v2i32, 2},
670 {Intrinsic::bitreverse, MVT::v4i32, 2},
671 {Intrinsic::bitreverse, MVT::v1i64, 2},
672 {Intrinsic::bitreverse, MVT::v2i64, 2},
673 };
674 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
675 const auto *Entry =
676 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
677 if (Entry) {
678 // Cost Model is using the legal type(i32) that i8 and i16 will be
679 // converted to +1 so that we match the actual lowering cost
680 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
681 TLI->getValueType(DL, RetTy, true) == MVT::i16)
682 return LegalisationCost.first * Entry->Cost + 1;
683
684 return LegalisationCost.first * Entry->Cost;
685 }
686 break;
687 }
688 case Intrinsic::ctpop: {
689 if (!ST->hasNEON()) {
690 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
691 return getTypeLegalizationCost(RetTy).first * 12;
692 }
693 static const CostTblEntry CtpopCostTbl[] = {
694 {ISD::CTPOP, MVT::v2i64, 4},
695 {ISD::CTPOP, MVT::v4i32, 3},
696 {ISD::CTPOP, MVT::v8i16, 2},
697 {ISD::CTPOP, MVT::v16i8, 1},
698 {ISD::CTPOP, MVT::i64, 4},
699 {ISD::CTPOP, MVT::v2i32, 3},
700 {ISD::CTPOP, MVT::v4i16, 2},
701 {ISD::CTPOP, MVT::v8i8, 1},
702 {ISD::CTPOP, MVT::i32, 5},
703 };
705 MVT MTy = LT.second;
706 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
707 // Extra cost of +1 when illegal vector types are legalized by promoting
708 // the integer type.
709 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
710 RetTy->getScalarSizeInBits()
711 ? 1
712 : 0;
713 return LT.first * Entry->Cost + ExtraCost;
714 }
715 break;
716 }
717 case Intrinsic::sadd_with_overflow:
718 case Intrinsic::uadd_with_overflow:
719 case Intrinsic::ssub_with_overflow:
720 case Intrinsic::usub_with_overflow:
721 case Intrinsic::smul_with_overflow:
722 case Intrinsic::umul_with_overflow: {
723 static const CostTblEntry WithOverflowCostTbl[] = {
724 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
725 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
726 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
727 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
728 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
729 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
730 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
731 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
732 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
733 {Intrinsic::usub_with_overflow, MVT::i8, 3},
734 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
735 {Intrinsic::usub_with_overflow, MVT::i16, 3},
736 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
737 {Intrinsic::usub_with_overflow, MVT::i32, 1},
738 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
739 {Intrinsic::usub_with_overflow, MVT::i64, 1},
740 {Intrinsic::smul_with_overflow, MVT::i8, 5},
741 {Intrinsic::umul_with_overflow, MVT::i8, 4},
742 {Intrinsic::smul_with_overflow, MVT::i16, 5},
743 {Intrinsic::umul_with_overflow, MVT::i16, 4},
744 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
745 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
746 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
747 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
748 };
749 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
750 if (MTy.isSimple())
751 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
752 MTy.getSimpleVT()))
753 return Entry->Cost;
754 break;
755 }
756 case Intrinsic::fptosi_sat:
757 case Intrinsic::fptoui_sat: {
758 if (ICA.getArgTypes().empty())
759 break;
760 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
761 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
762 EVT MTy = TLI->getValueType(DL, RetTy);
763 // Check for the legal types, which are where the size of the input and the
764 // output are the same, or we are using cvt f64->i32 or f32->i64.
765 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
766 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
767 LT.second == MVT::v2f64)) {
768 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
769 (LT.second == MVT::f64 && MTy == MVT::i32) ||
770 (LT.second == MVT::f32 && MTy == MVT::i64)))
771 return LT.first;
772 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
773 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
774 MTy.getScalarSizeInBits() == 64)
775 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
776 }
777 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
778 // f32.
779 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
780 return LT.first + getIntrinsicInstrCost(
781 {ICA.getID(),
782 RetTy,
783 {ICA.getArgTypes()[0]->getWithNewType(
784 Type::getFloatTy(RetTy->getContext()))}},
785 CostKind);
786 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
787 (LT.second == MVT::f16 && MTy == MVT::i64) ||
788 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
789 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
790 return LT.first;
791 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
792 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
793 MTy.getScalarSizeInBits() == 32)
794 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
795 // Extending vector types v8f16->v8i32. These current scalarize but the
796 // codegen could be better.
797 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
798 MTy.getScalarSizeInBits() == 64)
799 return MTy.getVectorNumElements() * 3;
800
801 // If we can we use a legal convert followed by a min+max
802 if ((LT.second.getScalarType() == MVT::f32 ||
803 LT.second.getScalarType() == MVT::f64 ||
804 LT.second.getScalarType() == MVT::f16) &&
805 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
806 Type *LegalTy =
807 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
808 if (LT.second.isVector())
809 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
811 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
812 LegalTy, {LegalTy, LegalTy});
814 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
815 LegalTy, {LegalTy, LegalTy});
817 return LT.first * Cost +
818 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
819 : 1);
820 }
821 // Otherwise we need to follow the default expansion that clamps the value
822 // using a float min/max with a fcmp+sel for nan handling when signed.
823 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
824 RetTy = RetTy->getScalarType();
825 if (LT.second.isVector()) {
826 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
827 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
828 }
829 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
831 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
833 Cost +=
834 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
836 if (IsSigned) {
837 Type *CondTy = RetTy->getWithNewBitWidth(1);
838 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
840 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
842 }
843 return LT.first * Cost;
844 }
845 case Intrinsic::fshl:
846 case Intrinsic::fshr: {
847 if (ICA.getArgs().empty())
848 break;
849
850 // TODO: Add handling for fshl where third argument is not a constant.
851 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
852 if (!OpInfoZ.isConstant())
853 break;
854
855 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
856 if (OpInfoZ.isUniform()) {
857 // FIXME: The costs could be lower if the codegen is better.
858 static const CostTblEntry FshlTbl[] = {
859 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
860 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
861 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
862 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
863 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
864 // to avoid having to duplicate the costs.
865 const auto *Entry =
866 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
867 if (Entry)
868 return LegalisationCost.first * Entry->Cost;
869 }
870
871 auto TyL = getTypeLegalizationCost(RetTy);
872 if (!RetTy->isIntegerTy())
873 break;
874
875 // Estimate cost manually, as types like i8 and i16 will get promoted to
876 // i32 and CostTableLookup will ignore the extra conversion cost.
877 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
878 RetTy->getScalarSizeInBits() < 64) ||
879 (RetTy->getScalarSizeInBits() % 64 != 0);
880 unsigned ExtraCost = HigherCost ? 1 : 0;
881 if (RetTy->getScalarSizeInBits() == 32 ||
882 RetTy->getScalarSizeInBits() == 64)
883 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
884 // extr instruction.
885 else if (HigherCost)
886 ExtraCost = 1;
887 else
888 break;
889 return TyL.first + ExtraCost;
890 }
891 case Intrinsic::get_active_lane_mask: {
892 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
893 if (RetTy) {
894 EVT RetVT = getTLI()->getValueType(DL, RetTy);
895 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
896 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
897 !getTLI()->isTypeLegal(RetVT)) {
898 // We don't have enough context at this point to determine if the mask
899 // is going to be kept live after the block, which will force the vXi1
900 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
901 // For now, we just assume the vectorizer created this intrinsic and
902 // the result will be the input for a PHI. In this case the cost will
903 // be extremely high for fixed-width vectors.
904 // NOTE: getScalarizationOverhead returns a cost that's far too
905 // pessimistic for the actual generated codegen. In reality there are
906 // two instructions generated per lane.
907 return RetTy->getNumElements() * 2;
908 }
909 }
910 break;
911 }
912 case Intrinsic::experimental_vector_match: {
913 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
914 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
915 unsigned SearchSize = NeedleTy->getNumElements();
916 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
917 // Base cost for MATCH instructions. At least on the Neoverse V2 and
918 // Neoverse V3, these are cheap operations with the same latency as a
919 // vector ADD. In most cases, however, we also need to do an extra DUP.
920 // For fixed-length vectors we currently need an extra five--six
921 // instructions besides the MATCH.
923 if (isa<FixedVectorType>(RetTy))
924 Cost += 10;
925 return Cost;
926 }
927 break;
928 }
929 default:
930 break;
931 }
933}
934
935/// The function will remove redundant reinterprets casting in the presence
936/// of the control flow
937static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
938 IntrinsicInst &II) {
940 auto RequiredType = II.getType();
941
942 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
943 assert(PN && "Expected Phi Node!");
944
945 // Don't create a new Phi unless we can remove the old one.
946 if (!PN->hasOneUse())
947 return std::nullopt;
948
949 for (Value *IncValPhi : PN->incoming_values()) {
950 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
951 if (!Reinterpret ||
952 Reinterpret->getIntrinsicID() !=
953 Intrinsic::aarch64_sve_convert_to_svbool ||
954 RequiredType != Reinterpret->getArgOperand(0)->getType())
955 return std::nullopt;
956 }
957
958 // Create the new Phi
959 IC.Builder.SetInsertPoint(PN);
960 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
961 Worklist.push_back(PN);
962
963 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
964 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
965 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
966 Worklist.push_back(Reinterpret);
967 }
968
969 // Cleanup Phi Node and reinterprets
970 return IC.replaceInstUsesWith(II, NPN);
971}
972
973// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
974// => (binop (pred) (from_svbool _) (from_svbool _))
975//
976// The above transformation eliminates a `to_svbool` in the predicate
977// operand of bitwise operation `binop` by narrowing the vector width of
978// the operation. For example, it would convert a `<vscale x 16 x i1>
979// and` into a `<vscale x 4 x i1> and`. This is profitable because
980// to_svbool must zero the new lanes during widening, whereas
981// from_svbool is free.
982static std::optional<Instruction *>
984 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
985 if (!BinOp)
986 return std::nullopt;
987
988 auto IntrinsicID = BinOp->getIntrinsicID();
989 switch (IntrinsicID) {
990 case Intrinsic::aarch64_sve_and_z:
991 case Intrinsic::aarch64_sve_bic_z:
992 case Intrinsic::aarch64_sve_eor_z:
993 case Intrinsic::aarch64_sve_nand_z:
994 case Intrinsic::aarch64_sve_nor_z:
995 case Intrinsic::aarch64_sve_orn_z:
996 case Intrinsic::aarch64_sve_orr_z:
997 break;
998 default:
999 return std::nullopt;
1000 }
1001
1002 auto BinOpPred = BinOp->getOperand(0);
1003 auto BinOpOp1 = BinOp->getOperand(1);
1004 auto BinOpOp2 = BinOp->getOperand(2);
1005
1006 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
1007 if (!PredIntr ||
1008 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1009 return std::nullopt;
1010
1011 auto PredOp = PredIntr->getOperand(0);
1012 auto PredOpTy = cast<VectorType>(PredOp->getType());
1013 if (PredOpTy != II.getType())
1014 return std::nullopt;
1015
1016 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1017 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1018 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
1019 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1020 if (BinOpOp1 == BinOpOp2)
1021 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
1022 else
1023 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1024 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1025
1026 auto NarrowedBinOp =
1027 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1028 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1029}
1030
1031static std::optional<Instruction *>
1033 // If the reinterpret instruction operand is a PHI Node
1034 if (isa<PHINode>(II.getArgOperand(0)))
1035 return processPhiNode(IC, II);
1036
1037 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1038 return BinOpCombine;
1039
1040 // Ignore converts to/from svcount_t.
1041 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1042 isa<TargetExtType>(II.getType()))
1043 return std::nullopt;
1044
1045 SmallVector<Instruction *, 32> CandidatesForRemoval;
1046 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1047
1048 const auto *IVTy = cast<VectorType>(II.getType());
1049
1050 // Walk the chain of conversions.
1051 while (Cursor) {
1052 // If the type of the cursor has fewer lanes than the final result, zeroing
1053 // must take place, which breaks the equivalence chain.
1054 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1055 if (CursorVTy->getElementCount().getKnownMinValue() <
1056 IVTy->getElementCount().getKnownMinValue())
1057 break;
1058
1059 // If the cursor has the same type as I, it is a viable replacement.
1060 if (Cursor->getType() == IVTy)
1061 EarliestReplacement = Cursor;
1062
1063 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1064
1065 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1066 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1067 Intrinsic::aarch64_sve_convert_to_svbool ||
1068 IntrinsicCursor->getIntrinsicID() ==
1069 Intrinsic::aarch64_sve_convert_from_svbool))
1070 break;
1071
1072 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1073 Cursor = IntrinsicCursor->getOperand(0);
1074 }
1075
1076 // If no viable replacement in the conversion chain was found, there is
1077 // nothing to do.
1078 if (!EarliestReplacement)
1079 return std::nullopt;
1080
1081 return IC.replaceInstUsesWith(II, EarliestReplacement);
1082}
1083
1084static bool isAllActivePredicate(Value *Pred) {
1085 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1086 Value *UncastedPred;
1087 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1088 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1089 m_Value(UncastedPred)))))
1090 // If the predicate has the same or less lanes than the uncasted
1091 // predicate then we know the casting has no effect.
1092 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1093 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1094 Pred = UncastedPred;
1095
1096 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1097 m_ConstantInt<AArch64SVEPredPattern::all>()));
1098}
1099
1100// Simplify unary operation where predicate has all inactive lanes by replacing
1101// instruction with its operand
1102static std::optional<Instruction *>
1104 bool hasInactiveVector) {
1105 int PredOperand = hasInactiveVector ? 1 : 0;
1106 int ReplaceOperand = hasInactiveVector ? 0 : 1;
1107 if (match(II.getOperand(PredOperand), m_ZeroInt())) {
1108 IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand));
1109 return IC.eraseInstFromFunction(II);
1110 }
1111 return std::nullopt;
1112}
1113
1114// Simplify unary operation where predicate has all inactive lanes or
1115// replace unused first operand with undef when all lanes are active
1116static std::optional<Instruction *>
1118 if (isAllActivePredicate(II.getOperand(1)) &&
1119 !isa<llvm::UndefValue>(II.getOperand(0)) &&
1120 !isa<llvm::PoisonValue>(II.getOperand(0))) {
1121 Value *Undef = llvm::UndefValue::get(II.getType());
1122 return IC.replaceOperand(II, 0, Undef);
1123 }
1124 return instCombineSVENoActiveReplace(IC, II, true);
1125}
1126
1127// Erase unary operation where predicate has all inactive lanes
1128static std::optional<Instruction *>
1130 int PredPos) {
1131 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1132 return IC.eraseInstFromFunction(II);
1133 }
1134 return std::nullopt;
1135}
1136
1137// Simplify operation where predicate has all inactive lanes by replacing
1138// instruction with zeroed object
1139static std::optional<Instruction *>
1141 if (match(II.getOperand(0), m_ZeroInt())) {
1142 Constant *Node;
1143 Type *RetTy = II.getType();
1144 if (RetTy->isStructTy()) {
1145 auto StructT = cast<StructType>(RetTy);
1146 auto VecT = StructT->getElementType(0);
1148 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1149 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1150 : ConstantInt::get(VecT, 0));
1151 }
1152 Node = ConstantStruct::get(StructT, ZerVec);
1153 } else
1154 Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0)
1155 : ConstantInt::get(II.getType(), 0);
1156
1158 return IC.eraseInstFromFunction(II);
1159 }
1160 return std::nullopt;
1161}
1162
1163static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1164 IntrinsicInst &II) {
1165 // svsel(ptrue, x, y) => x
1166 auto *OpPredicate = II.getOperand(0);
1167 if (isAllActivePredicate(OpPredicate))
1168 return IC.replaceInstUsesWith(II, II.getOperand(1));
1169
1170 auto Select =
1171 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1172 return IC.replaceInstUsesWith(II, Select);
1173}
1174
1175static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1176 IntrinsicInst &II) {
1177 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1178 if (!Pg)
1179 return std::nullopt;
1180
1181 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1182 return std::nullopt;
1183
1184 const auto PTruePattern =
1185 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1186 if (PTruePattern != AArch64SVEPredPattern::vl1)
1187 return std::nullopt;
1188
1189 // The intrinsic is inserting into lane zero so use an insert instead.
1190 auto *IdxTy = Type::getInt64Ty(II.getContext());
1191 auto *Insert = InsertElementInst::Create(
1192 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1193 Insert->insertBefore(&II);
1194 Insert->takeName(&II);
1195
1196 return IC.replaceInstUsesWith(II, Insert);
1197}
1198
1199static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1200 IntrinsicInst &II) {
1201 // Replace DupX with a regular IR splat.
1202 auto *RetTy = cast<ScalableVectorType>(II.getType());
1203 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1204 II.getArgOperand(0));
1205 Splat->takeName(&II);
1206 return IC.replaceInstUsesWith(II, Splat);
1207}
1208
1209static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1210 IntrinsicInst &II) {
1211 LLVMContext &Ctx = II.getContext();
1212
1213 // Replace by zero constant when all lanes are inactive
1214 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1215 return II_NA;
1216
1217 // Check that the predicate is all active
1218 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1219 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1220 return std::nullopt;
1221
1222 const auto PTruePattern =
1223 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1224 if (PTruePattern != AArch64SVEPredPattern::all)
1225 return std::nullopt;
1226
1227 // Check that we have a compare of zero..
1228 auto *SplatValue =
1229 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1230 if (!SplatValue || !SplatValue->isZero())
1231 return std::nullopt;
1232
1233 // ..against a dupq
1234 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1235 if (!DupQLane ||
1236 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1237 return std::nullopt;
1238
1239 // Where the dupq is a lane 0 replicate of a vector insert
1240 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
1241 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1242 return std::nullopt;
1243
1244 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1245 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1246 return std::nullopt;
1247
1248 // Where the vector insert is a fixed constant vector insert into undef at
1249 // index zero
1250 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1251 return std::nullopt;
1252
1253 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1254 return std::nullopt;
1255
1256 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1257 if (!ConstVec)
1258 return std::nullopt;
1259
1260 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1261 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1262 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1263 return std::nullopt;
1264
1265 unsigned NumElts = VecTy->getNumElements();
1266 unsigned PredicateBits = 0;
1267
1268 // Expand intrinsic operands to a 16-bit byte level predicate
1269 for (unsigned I = 0; I < NumElts; ++I) {
1270 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1271 if (!Arg)
1272 return std::nullopt;
1273 if (!Arg->isZero())
1274 PredicateBits |= 1 << (I * (16 / NumElts));
1275 }
1276
1277 // If all bits are zero bail early with an empty predicate
1278 if (PredicateBits == 0) {
1279 auto *PFalse = Constant::getNullValue(II.getType());
1280 PFalse->takeName(&II);
1281 return IC.replaceInstUsesWith(II, PFalse);
1282 }
1283
1284 // Calculate largest predicate type used (where byte predicate is largest)
1285 unsigned Mask = 8;
1286 for (unsigned I = 0; I < 16; ++I)
1287 if ((PredicateBits & (1 << I)) != 0)
1288 Mask |= (I % 8);
1289
1290 unsigned PredSize = Mask & -Mask;
1291 auto *PredType = ScalableVectorType::get(
1292 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1293
1294 // Ensure all relevant bits are set
1295 for (unsigned I = 0; I < 16; I += PredSize)
1296 if ((PredicateBits & (1 << I)) == 0)
1297 return std::nullopt;
1298
1299 auto *PTruePat =
1300 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1301 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1302 {PredType}, {PTruePat});
1303 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1304 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1305 auto *ConvertFromSVBool =
1306 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1307 {II.getType()}, {ConvertToSVBool});
1308
1309 ConvertFromSVBool->takeName(&II);
1310 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1311}
1312
1313static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1314 IntrinsicInst &II) {
1315 Value *Pg = II.getArgOperand(0);
1316 Value *Vec = II.getArgOperand(1);
1317 auto IntrinsicID = II.getIntrinsicID();
1318 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1319
1320 // lastX(splat(X)) --> X
1321 if (auto *SplatVal = getSplatValue(Vec))
1322 return IC.replaceInstUsesWith(II, SplatVal);
1323
1324 // If x and/or y is a splat value then:
1325 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1326 Value *LHS, *RHS;
1327 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1328 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1329 auto *OldBinOp = cast<BinaryOperator>(Vec);
1330 auto OpC = OldBinOp->getOpcode();
1331 auto *NewLHS =
1332 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1333 auto *NewRHS =
1334 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1336 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1337 return IC.replaceInstUsesWith(II, NewBinOp);
1338 }
1339 }
1340
1341 auto *C = dyn_cast<Constant>(Pg);
1342 if (IsAfter && C && C->isNullValue()) {
1343 // The intrinsic is extracting lane 0 so use an extract instead.
1344 auto *IdxTy = Type::getInt64Ty(II.getContext());
1345 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1346 Extract->insertBefore(&II);
1347 Extract->takeName(&II);
1348 return IC.replaceInstUsesWith(II, Extract);
1349 }
1350
1351 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1352 if (!IntrPG)
1353 return std::nullopt;
1354
1355 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1356 return std::nullopt;
1357
1358 const auto PTruePattern =
1359 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1360
1361 // Can the intrinsic's predicate be converted to a known constant index?
1362 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1363 if (!MinNumElts)
1364 return std::nullopt;
1365
1366 unsigned Idx = MinNumElts - 1;
1367 // Increment the index if extracting the element after the last active
1368 // predicate element.
1369 if (IsAfter)
1370 ++Idx;
1371
1372 // Ignore extracts whose index is larger than the known minimum vector
1373 // length. NOTE: This is an artificial constraint where we prefer to
1374 // maintain what the user asked for until an alternative is proven faster.
1375 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1376 if (Idx >= PgVTy->getMinNumElements())
1377 return std::nullopt;
1378
1379 // The intrinsic is extracting a fixed lane so use an extract instead.
1380 auto *IdxTy = Type::getInt64Ty(II.getContext());
1381 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1382 Extract->insertBefore(&II);
1383 Extract->takeName(&II);
1384 return IC.replaceInstUsesWith(II, Extract);
1385}
1386
1387static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1388 IntrinsicInst &II) {
1389 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1390 // integer variant across a variety of micro-architectures. Replace scalar
1391 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1392 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1393 // depending on the micro-architecture, but has been observed as generally
1394 // being faster, particularly when the CLAST[AB] op is a loop-carried
1395 // dependency.
1396 Value *Pg = II.getArgOperand(0);
1397 Value *Fallback = II.getArgOperand(1);
1398 Value *Vec = II.getArgOperand(2);
1399 Type *Ty = II.getType();
1400
1401 if (!Ty->isIntegerTy())
1402 return std::nullopt;
1403
1404 Type *FPTy;
1405 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1406 default:
1407 return std::nullopt;
1408 case 16:
1409 FPTy = IC.Builder.getHalfTy();
1410 break;
1411 case 32:
1412 FPTy = IC.Builder.getFloatTy();
1413 break;
1414 case 64:
1415 FPTy = IC.Builder.getDoubleTy();
1416 break;
1417 }
1418
1419 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1420 auto *FPVTy = VectorType::get(
1421 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1422 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1423 auto *FPII = IC.Builder.CreateIntrinsic(
1424 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1425 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1426 return IC.replaceInstUsesWith(II, FPIItoInt);
1427}
1428
1429static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1430 IntrinsicInst &II) {
1431 LLVMContext &Ctx = II.getContext();
1432 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1433 // can work with RDFFR_PP for ptest elimination.
1434 auto *AllPat =
1435 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1436 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1437 {II.getType()}, {AllPat});
1438 auto *RDFFR =
1439 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1440 RDFFR->takeName(&II);
1441 return IC.replaceInstUsesWith(II, RDFFR);
1442}
1443
1444static std::optional<Instruction *>
1446 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1447
1448 if (Pattern == AArch64SVEPredPattern::all) {
1449 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1450 auto *VScale = IC.Builder.CreateVScale(StepVal);
1451 VScale->takeName(&II);
1452 return IC.replaceInstUsesWith(II, VScale);
1453 }
1454
1455 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1456
1457 return MinNumElts && NumElts >= MinNumElts
1458 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1459 II, ConstantInt::get(II.getType(), MinNumElts)))
1460 : std::nullopt;
1461}
1462
1463static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1464 IntrinsicInst &II) {
1465 Value *PgVal = II.getArgOperand(0);
1466 Value *OpVal = II.getArgOperand(1);
1467
1468 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1469 // Later optimizations prefer this form.
1470 if (PgVal == OpVal &&
1471 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1472 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1473 Value *Ops[] = {PgVal, OpVal};
1474 Type *Tys[] = {PgVal->getType()};
1475
1476 auto *PTest =
1477 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1478 PTest->takeName(&II);
1479
1480 return IC.replaceInstUsesWith(II, PTest);
1481 }
1482
1483 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1484 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1485
1486 if (!Pg || !Op)
1487 return std::nullopt;
1488
1489 Intrinsic::ID OpIID = Op->getIntrinsicID();
1490
1491 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1492 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1493 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1494 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1495 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1496
1497 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1498
1499 PTest->takeName(&II);
1500 return IC.replaceInstUsesWith(II, PTest);
1501 }
1502
1503 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1504 // Later optimizations may rewrite sequence to use the flag-setting variant
1505 // of instruction X to remove PTEST.
1506 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1507 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1508 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1509 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1510 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1511 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1512 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1513 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1514 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1515 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1516 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1517 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1518 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1519 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1520 Type *Tys[] = {Pg->getType()};
1521
1522 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1523 PTest->takeName(&II);
1524
1525 return IC.replaceInstUsesWith(II, PTest);
1526 }
1527
1528 return std::nullopt;
1529}
1530
1531template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1532static std::optional<Instruction *>
1534 bool MergeIntoAddendOp) {
1535 Value *P = II.getOperand(0);
1536 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1537 if (MergeIntoAddendOp) {
1538 AddendOp = II.getOperand(1);
1539 Mul = II.getOperand(2);
1540 } else {
1541 AddendOp = II.getOperand(2);
1542 Mul = II.getOperand(1);
1543 }
1544
1545 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1546 m_Value(MulOp1))))
1547 return std::nullopt;
1548
1549 if (!Mul->hasOneUse())
1550 return std::nullopt;
1551
1552 Instruction *FMFSource = nullptr;
1553 if (II.getType()->isFPOrFPVectorTy()) {
1554 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1555 // Stop the combine when the flags on the inputs differ in case dropping
1556 // flags would lead to us missing out on more beneficial optimizations.
1557 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1558 return std::nullopt;
1559 if (!FAddFlags.allowContract())
1560 return std::nullopt;
1561 FMFSource = &II;
1562 }
1563
1564 CallInst *Res;
1565 if (MergeIntoAddendOp)
1566 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1567 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1568 else
1569 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1570 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1571
1572 return IC.replaceInstUsesWith(II, Res);
1573}
1574
1575static std::optional<Instruction *>
1577 Value *Pred = II.getOperand(0);
1578 Value *PtrOp = II.getOperand(1);
1579 Type *VecTy = II.getType();
1580
1581 // Replace by zero constant when all lanes are inactive
1582 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1583 return II_NA;
1584
1585 if (isAllActivePredicate(Pred)) {
1586 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1587 Load->copyMetadata(II);
1588 return IC.replaceInstUsesWith(II, Load);
1589 }
1590
1591 CallInst *MaskedLoad =
1592 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1593 Pred, ConstantAggregateZero::get(VecTy));
1594 MaskedLoad->copyMetadata(II);
1595 return IC.replaceInstUsesWith(II, MaskedLoad);
1596}
1597
1598static std::optional<Instruction *>
1600 Value *VecOp = II.getOperand(0);
1601 Value *Pred = II.getOperand(1);
1602 Value *PtrOp = II.getOperand(2);
1603
1604 if (isAllActivePredicate(Pred)) {
1605 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1606 Store->copyMetadata(II);
1607 return IC.eraseInstFromFunction(II);
1608 }
1609
1610 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1611 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1612 MaskedStore->copyMetadata(II);
1613 return IC.eraseInstFromFunction(II);
1614}
1615
1617 switch (Intrinsic) {
1618 case Intrinsic::aarch64_sve_fmul_u:
1619 return Instruction::BinaryOps::FMul;
1620 case Intrinsic::aarch64_sve_fadd_u:
1621 return Instruction::BinaryOps::FAdd;
1622 case Intrinsic::aarch64_sve_fsub_u:
1623 return Instruction::BinaryOps::FSub;
1624 default:
1625 return Instruction::BinaryOpsEnd;
1626 }
1627}
1628
1629static std::optional<Instruction *>
1631 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1632 if (II.isStrictFP())
1633 return std::nullopt;
1634
1635 auto *OpPredicate = II.getOperand(0);
1636 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1637 if (BinOpCode == Instruction::BinaryOpsEnd ||
1638 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1639 m_ConstantInt<AArch64SVEPredPattern::all>())))
1640 return std::nullopt;
1641 auto BinOp = IC.Builder.CreateBinOpFMF(
1642 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
1643 return IC.replaceInstUsesWith(II, BinOp);
1644}
1645
1646// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1647// sve.add_u).
1648static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1649 Intrinsic::ID IID) {
1650 auto *OpPredicate = II.getOperand(0);
1651 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1652 m_ConstantInt<AArch64SVEPredPattern::all>())))
1653 return std::nullopt;
1654
1655 auto *Mod = II.getModule();
1656 auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()});
1657 II.setCalledFunction(NewDecl);
1658
1659 return &II;
1660}
1661
1662// Simplify operations where predicate has all inactive lanes or try to replace
1663// with _u form when all lanes are active
1664static std::optional<Instruction *>
1666 Intrinsic::ID IID) {
1667 if (match(II.getOperand(0), m_ZeroInt())) {
1668 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1669 // inactive for sv[func]_m
1670 return IC.replaceInstUsesWith(II, II.getOperand(1));
1671 }
1672 return instCombineSVEAllActive(II, IID);
1673}
1674
1675static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1676 IntrinsicInst &II) {
1677 if (auto II_U =
1678 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1679 return II_U;
1680 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1681 Intrinsic::aarch64_sve_mla>(
1682 IC, II, true))
1683 return MLA;
1684 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1685 Intrinsic::aarch64_sve_mad>(
1686 IC, II, false))
1687 return MAD;
1688 return std::nullopt;
1689}
1690
1691static std::optional<Instruction *>
1693 if (auto II_U =
1694 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1695 return II_U;
1696 if (auto FMLA =
1697 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1698 Intrinsic::aarch64_sve_fmla>(IC, II,
1699 true))
1700 return FMLA;
1701 if (auto FMAD =
1702 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1703 Intrinsic::aarch64_sve_fmad>(IC, II,
1704 false))
1705 return FMAD;
1706 if (auto FMLA =
1707 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1708 Intrinsic::aarch64_sve_fmla>(IC, II,
1709 true))
1710 return FMLA;
1711 return std::nullopt;
1712}
1713
1714static std::optional<Instruction *>
1716 if (auto FMLA =
1717 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1718 Intrinsic::aarch64_sve_fmla>(IC, II,
1719 true))
1720 return FMLA;
1721 if (auto FMAD =
1722 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1723 Intrinsic::aarch64_sve_fmad>(IC, II,
1724 false))
1725 return FMAD;
1726 if (auto FMLA_U =
1727 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1728 Intrinsic::aarch64_sve_fmla_u>(
1729 IC, II, true))
1730 return FMLA_U;
1731 return instCombineSVEVectorBinOp(IC, II);
1732}
1733
1734static std::optional<Instruction *>
1736 if (auto II_U =
1737 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1738 return II_U;
1739 if (auto FMLS =
1740 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1741 Intrinsic::aarch64_sve_fmls>(IC, II,
1742 true))
1743 return FMLS;
1744 if (auto FMSB =
1745 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1746 Intrinsic::aarch64_sve_fnmsb>(
1747 IC, II, false))
1748 return FMSB;
1749 if (auto FMLS =
1750 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1751 Intrinsic::aarch64_sve_fmls>(IC, II,
1752 true))
1753 return FMLS;
1754 return std::nullopt;
1755}
1756
1757static std::optional<Instruction *>
1759 if (auto FMLS =
1760 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1761 Intrinsic::aarch64_sve_fmls>(IC, II,
1762 true))
1763 return FMLS;
1764 if (auto FMSB =
1765 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1766 Intrinsic::aarch64_sve_fnmsb>(
1767 IC, II, false))
1768 return FMSB;
1769 if (auto FMLS_U =
1770 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1771 Intrinsic::aarch64_sve_fmls_u>(
1772 IC, II, true))
1773 return FMLS_U;
1774 return instCombineSVEVectorBinOp(IC, II);
1775}
1776
1777static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1778 IntrinsicInst &II) {
1779 if (auto II_U =
1780 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1781 return II_U;
1782 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1783 Intrinsic::aarch64_sve_mls>(
1784 IC, II, true))
1785 return MLS;
1786 return std::nullopt;
1787}
1788
1789static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1791 Intrinsic::ID IID) {
1792 auto *OpPredicate = II.getOperand(0);
1793 auto *OpMultiplicand = II.getOperand(1);
1794 auto *OpMultiplier = II.getOperand(2);
1795
1796 // Return true if a given instruction is a unit splat value, false otherwise.
1797 auto IsUnitSplat = [](auto *I) {
1798 auto *SplatValue = getSplatValue(I);
1799 if (!SplatValue)
1800 return false;
1801 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1802 };
1803
1804 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1805 // with a unit splat value, false otherwise.
1806 auto IsUnitDup = [](auto *I) {
1807 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1808 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1809 return false;
1810
1811 auto *SplatValue = IntrI->getOperand(2);
1812 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1813 };
1814
1815 if (IsUnitSplat(OpMultiplier)) {
1816 // [f]mul pg %n, (dupx 1) => %n
1817 OpMultiplicand->takeName(&II);
1818 return IC.replaceInstUsesWith(II, OpMultiplicand);
1819 } else if (IsUnitDup(OpMultiplier)) {
1820 // [f]mul pg %n, (dup pg 1) => %n
1821 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1822 auto *DupPg = DupInst->getOperand(1);
1823 // TODO: this is naive. The optimization is still valid if DupPg
1824 // 'encompasses' OpPredicate, not only if they're the same predicate.
1825 if (OpPredicate == DupPg) {
1826 OpMultiplicand->takeName(&II);
1827 return IC.replaceInstUsesWith(II, OpMultiplicand);
1828 }
1829 }
1830
1831 return instCombineSVEVectorBinOp(IC, II);
1832}
1833
1834static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1835 IntrinsicInst &II) {
1836 Value *UnpackArg = II.getArgOperand(0);
1837 auto *RetTy = cast<ScalableVectorType>(II.getType());
1838 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1839 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1840
1841 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1842 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1843 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1844 ScalarArg =
1845 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1846 Value *NewVal =
1847 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1848 NewVal->takeName(&II);
1849 return IC.replaceInstUsesWith(II, NewVal);
1850 }
1851
1852 return std::nullopt;
1853}
1854static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1855 IntrinsicInst &II) {
1856 auto *OpVal = II.getOperand(0);
1857 auto *OpIndices = II.getOperand(1);
1858 VectorType *VTy = cast<VectorType>(II.getType());
1859
1860 // Check whether OpIndices is a constant splat value < minimal element count
1861 // of result.
1862 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1863 if (!SplatValue ||
1864 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1865 return std::nullopt;
1866
1867 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1868 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1869 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1870 auto *VectorSplat =
1871 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1872
1873 VectorSplat->takeName(&II);
1874 return IC.replaceInstUsesWith(II, VectorSplat);
1875}
1876
1877static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1878 IntrinsicInst &II) {
1879 Value *A, *B;
1880 Type *RetTy = II.getType();
1881 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1882 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1883
1884 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1885 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1886 if ((match(II.getArgOperand(0),
1887 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1888 match(II.getArgOperand(1),
1889 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1890 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1891 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1892 auto *TyA = cast<ScalableVectorType>(A->getType());
1893 if (TyA == B->getType() &&
1895 auto *SubVec = IC.Builder.CreateInsertVector(
1897 auto *ConcatVec = IC.Builder.CreateInsertVector(
1898 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1899 ConcatVec->takeName(&II);
1900 return IC.replaceInstUsesWith(II, ConcatVec);
1901 }
1902 }
1903
1904 return std::nullopt;
1905}
1906
1907static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1908 IntrinsicInst &II) {
1909 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1910 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1911 Value *A, *B;
1912 if (match(II.getArgOperand(0),
1913 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1914 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1915 m_Specific(A), m_Specific(B))))
1916 return IC.replaceInstUsesWith(
1917 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1918
1919 return std::nullopt;
1920}
1921
1922static std::optional<Instruction *>
1924 Value *Mask = II.getOperand(0);
1925 Value *BasePtr = II.getOperand(1);
1926 Value *Index = II.getOperand(2);
1927 Type *Ty = II.getType();
1928 Value *PassThru = ConstantAggregateZero::get(Ty);
1929
1930 // Replace by zero constant when all lanes are inactive
1931 if (auto II_NA = instCombineSVENoActiveZero(IC, II))
1932 return II_NA;
1933
1934 // Contiguous gather => masked load.
1935 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1936 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1937 Value *IndexBase;
1938 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1939 m_Value(IndexBase), m_SpecificInt(1)))) {
1940 Align Alignment =
1941 BasePtr->getPointerAlignment(II.getDataLayout());
1942
1943 Type *VecPtrTy = PointerType::getUnqual(Ty);
1944 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1945 BasePtr, IndexBase);
1946 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1947 CallInst *MaskedLoad =
1948 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1949 MaskedLoad->takeName(&II);
1950 return IC.replaceInstUsesWith(II, MaskedLoad);
1951 }
1952
1953 return std::nullopt;
1954}
1955
1956static std::optional<Instruction *>
1958 Value *Val = II.getOperand(0);
1959 Value *Mask = II.getOperand(1);
1960 Value *BasePtr = II.getOperand(2);
1961 Value *Index = II.getOperand(3);
1962 Type *Ty = Val->getType();
1963
1964 // Contiguous scatter => masked store.
1965 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1966 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1967 Value *IndexBase;
1968 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1969 m_Value(IndexBase), m_SpecificInt(1)))) {
1970 Align Alignment =
1971 BasePtr->getPointerAlignment(II.getDataLayout());
1972
1973 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1974 BasePtr, IndexBase);
1975 Type *VecPtrTy = PointerType::getUnqual(Ty);
1976 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1977
1978 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1979
1980 return IC.eraseInstFromFunction(II);
1981 }
1982
1983 return std::nullopt;
1984}
1985
1986static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1987 IntrinsicInst &II) {
1988 Type *Int32Ty = IC.Builder.getInt32Ty();
1989 Value *Pred = II.getOperand(0);
1990 Value *Vec = II.getOperand(1);
1991 Value *DivVec = II.getOperand(2);
1992
1993 Value *SplatValue = getSplatValue(DivVec);
1994 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1995 if (!SplatConstantInt)
1996 return std::nullopt;
1997
1998 APInt Divisor = SplatConstantInt->getValue();
1999 const int64_t DivisorValue = Divisor.getSExtValue();
2000 if (DivisorValue == -1)
2001 return std::nullopt;
2002 if (DivisorValue == 1)
2003 IC.replaceInstUsesWith(II, Vec);
2004
2005 if (Divisor.isPowerOf2()) {
2006 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2007 auto ASRD = IC.Builder.CreateIntrinsic(
2008 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2009 return IC.replaceInstUsesWith(II, ASRD);
2010 }
2011 if (Divisor.isNegatedPowerOf2()) {
2012 Divisor.negate();
2013 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2014 auto ASRD = IC.Builder.CreateIntrinsic(
2015 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2016 auto NEG = IC.Builder.CreateIntrinsic(
2017 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2018 return IC.replaceInstUsesWith(II, NEG);
2019 }
2020
2021 return std::nullopt;
2022}
2023
2024bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2025 size_t VecSize = Vec.size();
2026 if (VecSize == 1)
2027 return true;
2028 if (!isPowerOf2_64(VecSize))
2029 return false;
2030 size_t HalfVecSize = VecSize / 2;
2031
2032 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2033 RHS != Vec.end(); LHS++, RHS++) {
2034 if (*LHS != nullptr && *RHS != nullptr) {
2035 if (*LHS == *RHS)
2036 continue;
2037 else
2038 return false;
2039 }
2040 if (!AllowPoison)
2041 return false;
2042 if (*LHS == nullptr && *RHS != nullptr)
2043 *LHS = *RHS;
2044 }
2045
2046 Vec.resize(HalfVecSize);
2047 SimplifyValuePattern(Vec, AllowPoison);
2048 return true;
2049}
2050
2051// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2052// to dupqlane(f64(C)) where C is A concatenated with B
2053static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2054 IntrinsicInst &II) {
2055 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2056 if (!match(II.getOperand(0),
2057 m_Intrinsic<Intrinsic::vector_insert>(
2058 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2059 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2060 return std::nullopt;
2061 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2062
2063 // Insert the scalars into a container ordered by InsertElement index
2064 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2065 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2066 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2067 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2068 CurrentInsertElt = InsertElt->getOperand(0);
2069 }
2070
2071 bool AllowPoison =
2072 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2073 if (!SimplifyValuePattern(Elts, AllowPoison))
2074 return std::nullopt;
2075
2076 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2077 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2078 for (size_t I = 0; I < Elts.size(); I++) {
2079 if (Elts[I] == nullptr)
2080 continue;
2081 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2082 IC.Builder.getInt64(I));
2083 }
2084 if (InsertEltChain == nullptr)
2085 return std::nullopt;
2086
2087 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2088 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2089 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2090 // be narrowed back to the original type.
2091 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2092 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2093 IIScalableTy->getMinNumElements() /
2094 PatternWidth;
2095
2096 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2097 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2098 auto *WideShuffleMaskTy =
2099 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2100
2101 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
2102 auto InsertSubvector = IC.Builder.CreateInsertVector(
2103 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
2104 auto WideBitcast =
2105 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2106 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2107 auto WideShuffle = IC.Builder.CreateShuffleVector(
2108 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2109 auto NarrowBitcast =
2110 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2111
2112 return IC.replaceInstUsesWith(II, NarrowBitcast);
2113}
2114
2115static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2116 IntrinsicInst &II) {
2117 Value *A = II.getArgOperand(0);
2118 Value *B = II.getArgOperand(1);
2119 if (A == B)
2120 return IC.replaceInstUsesWith(II, A);
2121
2122 return std::nullopt;
2123}
2124
2125static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2126 IntrinsicInst &II) {
2127 Value *Pred = II.getOperand(0);
2128 Value *Vec = II.getOperand(1);
2129 Value *Shift = II.getOperand(2);
2130
2131 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2132 Value *AbsPred, *MergedValue;
2133 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2134 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2135 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2136 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2137
2138 return std::nullopt;
2139
2140 // Transform is valid if any of the following are true:
2141 // * The ABS merge value is an undef or non-negative
2142 // * The ABS predicate is all active
2143 // * The ABS predicate and the SRSHL predicates are the same
2144 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2145 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2146 return std::nullopt;
2147
2148 // Only valid when the shift amount is non-negative, otherwise the rounding
2149 // behaviour of SRSHL cannot be ignored.
2150 if (!match(Shift, m_NonNegative()))
2151 return std::nullopt;
2152
2153 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2154 {II.getType()}, {Pred, Vec, Shift});
2155
2156 return IC.replaceInstUsesWith(II, LSL);
2157}
2158
2159static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2160 IntrinsicInst &II) {
2161 Value *Vec = II.getOperand(0);
2162
2163 if (getSplatValue(Vec) == II.getOperand(1))
2164 return IC.replaceInstUsesWith(II, Vec);
2165
2166 return std::nullopt;
2167}
2168
2169static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2170 IntrinsicInst &II) {
2171 // If this barrier is post-dominated by identical one we can remove it
2172 auto *NI = II.getNextNonDebugInstruction();
2173 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2174 auto CanSkipOver = [](Instruction *I) {
2175 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2176 };
2177 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2178 auto *NIBB = NI->getParent();
2179 NI = NI->getNextNonDebugInstruction();
2180 if (!NI) {
2181 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2182 NI = SuccBB->getFirstNonPHIOrDbgOrLifetime();
2183 else
2184 break;
2185 }
2186 }
2187 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
2188 if (NextII && II.isIdenticalTo(NextII))
2189 return IC.eraseInstFromFunction(II);
2190
2191 return std::nullopt;
2192}
2193
2194std::optional<Instruction *>
2196 IntrinsicInst &II) const {
2197 Intrinsic::ID IID = II.getIntrinsicID();
2198 switch (IID) {
2199 default:
2200 break;
2201 case Intrinsic::aarch64_dmb:
2202 return instCombineDMB(IC, II);
2203 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
2204 case Intrinsic::aarch64_sve_fcvt_f16f32:
2205 case Intrinsic::aarch64_sve_fcvt_f16f64:
2206 case Intrinsic::aarch64_sve_fcvt_f32f16:
2207 case Intrinsic::aarch64_sve_fcvt_f32f64:
2208 case Intrinsic::aarch64_sve_fcvt_f64f16:
2209 case Intrinsic::aarch64_sve_fcvt_f64f32:
2210 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
2211 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
2212 case Intrinsic::aarch64_sve_fcvtx_f32f64:
2213 case Intrinsic::aarch64_sve_fcvtzs:
2214 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
2215 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
2216 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
2217 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
2218 case Intrinsic::aarch64_sve_fcvtzu:
2219 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
2220 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
2221 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
2222 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
2223 case Intrinsic::aarch64_sve_scvtf:
2224 case Intrinsic::aarch64_sve_scvtf_f16i32:
2225 case Intrinsic::aarch64_sve_scvtf_f16i64:
2226 case Intrinsic::aarch64_sve_scvtf_f32i64:
2227 case Intrinsic::aarch64_sve_scvtf_f64i32:
2228 case Intrinsic::aarch64_sve_ucvtf:
2229 case Intrinsic::aarch64_sve_ucvtf_f16i32:
2230 case Intrinsic::aarch64_sve_ucvtf_f16i64:
2231 case Intrinsic::aarch64_sve_ucvtf_f32i64:
2232 case Intrinsic::aarch64_sve_ucvtf_f64i32:
2234 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
2235 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
2236 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
2237 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
2238 return instCombineSVENoActiveReplace(IC, II, true);
2239 case Intrinsic::aarch64_sve_st1_scatter:
2240 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2241 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2242 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2243 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2244 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2245 case Intrinsic::aarch64_sve_st1dq:
2246 case Intrinsic::aarch64_sve_st1q_scatter_index:
2247 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2248 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2249 case Intrinsic::aarch64_sve_st1wq:
2250 case Intrinsic::aarch64_sve_stnt1:
2251 case Intrinsic::aarch64_sve_stnt1_scatter:
2252 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2253 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2254 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2255 return instCombineSVENoActiveUnaryErase(IC, II, 1);
2256 case Intrinsic::aarch64_sve_st2:
2257 case Intrinsic::aarch64_sve_st2q:
2258 return instCombineSVENoActiveUnaryErase(IC, II, 2);
2259 case Intrinsic::aarch64_sve_st3:
2260 case Intrinsic::aarch64_sve_st3q:
2261 return instCombineSVENoActiveUnaryErase(IC, II, 3);
2262 case Intrinsic::aarch64_sve_st4:
2263 case Intrinsic::aarch64_sve_st4q:
2264 return instCombineSVENoActiveUnaryErase(IC, II, 4);
2265 case Intrinsic::aarch64_sve_addqv:
2266 case Intrinsic::aarch64_sve_and_z:
2267 case Intrinsic::aarch64_sve_bic_z:
2268 case Intrinsic::aarch64_sve_brka_z:
2269 case Intrinsic::aarch64_sve_brkb_z:
2270 case Intrinsic::aarch64_sve_brkn_z:
2271 case Intrinsic::aarch64_sve_brkpa_z:
2272 case Intrinsic::aarch64_sve_brkpb_z:
2273 case Intrinsic::aarch64_sve_cntp:
2274 case Intrinsic::aarch64_sve_compact:
2275 case Intrinsic::aarch64_sve_eor_z:
2276 case Intrinsic::aarch64_sve_eorv:
2277 case Intrinsic::aarch64_sve_eorqv:
2278 case Intrinsic::aarch64_sve_nand_z:
2279 case Intrinsic::aarch64_sve_nor_z:
2280 case Intrinsic::aarch64_sve_orn_z:
2281 case Intrinsic::aarch64_sve_orr_z:
2282 case Intrinsic::aarch64_sve_orv:
2283 case Intrinsic::aarch64_sve_orqv:
2284 case Intrinsic::aarch64_sve_pnext:
2285 case Intrinsic::aarch64_sve_rdffr_z:
2286 case Intrinsic::aarch64_sve_saddv:
2287 case Intrinsic::aarch64_sve_uaddv:
2288 case Intrinsic::aarch64_sve_umaxv:
2289 case Intrinsic::aarch64_sve_umaxqv:
2290 case Intrinsic::aarch64_sve_cmpeq:
2291 case Intrinsic::aarch64_sve_cmpeq_wide:
2292 case Intrinsic::aarch64_sve_cmpge:
2293 case Intrinsic::aarch64_sve_cmpge_wide:
2294 case Intrinsic::aarch64_sve_cmpgt:
2295 case Intrinsic::aarch64_sve_cmpgt_wide:
2296 case Intrinsic::aarch64_sve_cmphi:
2297 case Intrinsic::aarch64_sve_cmphi_wide:
2298 case Intrinsic::aarch64_sve_cmphs:
2299 case Intrinsic::aarch64_sve_cmphs_wide:
2300 case Intrinsic::aarch64_sve_cmple_wide:
2301 case Intrinsic::aarch64_sve_cmplo_wide:
2302 case Intrinsic::aarch64_sve_cmpls_wide:
2303 case Intrinsic::aarch64_sve_cmplt_wide:
2304 case Intrinsic::aarch64_sve_facge:
2305 case Intrinsic::aarch64_sve_facgt:
2306 case Intrinsic::aarch64_sve_fcmpeq:
2307 case Intrinsic::aarch64_sve_fcmpge:
2308 case Intrinsic::aarch64_sve_fcmpgt:
2309 case Intrinsic::aarch64_sve_fcmpne:
2310 case Intrinsic::aarch64_sve_fcmpuo:
2311 case Intrinsic::aarch64_sve_ld1_gather:
2312 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2313 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2314 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2315 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2316 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2317 case Intrinsic::aarch64_sve_ld1q_gather_index:
2318 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2319 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2320 case Intrinsic::aarch64_sve_ld1ro:
2321 case Intrinsic::aarch64_sve_ld1rq:
2322 case Intrinsic::aarch64_sve_ld1udq:
2323 case Intrinsic::aarch64_sve_ld1uwq:
2324 case Intrinsic::aarch64_sve_ld2_sret:
2325 case Intrinsic::aarch64_sve_ld2q_sret:
2326 case Intrinsic::aarch64_sve_ld3_sret:
2327 case Intrinsic::aarch64_sve_ld3q_sret:
2328 case Intrinsic::aarch64_sve_ld4_sret:
2329 case Intrinsic::aarch64_sve_ld4q_sret:
2330 case Intrinsic::aarch64_sve_ldff1:
2331 case Intrinsic::aarch64_sve_ldff1_gather:
2332 case Intrinsic::aarch64_sve_ldff1_gather_index:
2333 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2334 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2335 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2336 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2337 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2338 case Intrinsic::aarch64_sve_ldnf1:
2339 case Intrinsic::aarch64_sve_ldnt1:
2340 case Intrinsic::aarch64_sve_ldnt1_gather:
2341 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2342 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2343 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2344 return instCombineSVENoActiveZero(IC, II);
2345 case Intrinsic::aarch64_sve_prf:
2346 case Intrinsic::aarch64_sve_prfb_gather_index:
2347 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
2348 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
2349 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
2350 case Intrinsic::aarch64_sve_prfd_gather_index:
2351 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
2352 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
2353 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
2354 case Intrinsic::aarch64_sve_prfh_gather_index:
2355 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
2356 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
2357 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
2358 case Intrinsic::aarch64_sve_prfw_gather_index:
2359 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
2360 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
2361 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
2362 return instCombineSVENoActiveUnaryErase(IC, II, 0);
2363 case Intrinsic::aarch64_neon_fmaxnm:
2364 case Intrinsic::aarch64_neon_fminnm:
2365 return instCombineMaxMinNM(IC, II);
2366 case Intrinsic::aarch64_sve_convert_from_svbool:
2367 return instCombineConvertFromSVBool(IC, II);
2368 case Intrinsic::aarch64_sve_dup:
2369 return instCombineSVEDup(IC, II);
2370 case Intrinsic::aarch64_sve_dup_x:
2371 return instCombineSVEDupX(IC, II);
2372 case Intrinsic::aarch64_sve_cmpne:
2373 case Intrinsic::aarch64_sve_cmpne_wide:
2374 return instCombineSVECmpNE(IC, II);
2375 case Intrinsic::aarch64_sve_rdffr:
2376 return instCombineRDFFR(IC, II);
2377 case Intrinsic::aarch64_sve_lasta:
2378 case Intrinsic::aarch64_sve_lastb:
2379 return instCombineSVELast(IC, II);
2380 case Intrinsic::aarch64_sve_clasta_n:
2381 case Intrinsic::aarch64_sve_clastb_n:
2382 return instCombineSVECondLast(IC, II);
2383 case Intrinsic::aarch64_sve_cntd:
2384 return instCombineSVECntElts(IC, II, 2);
2385 case Intrinsic::aarch64_sve_cntw:
2386 return instCombineSVECntElts(IC, II, 4);
2387 case Intrinsic::aarch64_sve_cnth:
2388 return instCombineSVECntElts(IC, II, 8);
2389 case Intrinsic::aarch64_sve_cntb:
2390 return instCombineSVECntElts(IC, II, 16);
2391 case Intrinsic::aarch64_sve_ptest_any:
2392 case Intrinsic::aarch64_sve_ptest_first:
2393 case Intrinsic::aarch64_sve_ptest_last:
2394 return instCombineSVEPTest(IC, II);
2395 case Intrinsic::aarch64_sve_fabd:
2396 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2397 case Intrinsic::aarch64_sve_fadd:
2398 return instCombineSVEVectorFAdd(IC, II);
2399 case Intrinsic::aarch64_sve_fadd_u:
2400 return instCombineSVEVectorFAddU(IC, II);
2401 case Intrinsic::aarch64_sve_fdiv:
2402 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2403 case Intrinsic::aarch64_sve_fmax:
2404 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2405 case Intrinsic::aarch64_sve_fmaxnm:
2406 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2407 case Intrinsic::aarch64_sve_fmin:
2408 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2409 case Intrinsic::aarch64_sve_fminnm:
2410 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2411 case Intrinsic::aarch64_sve_fmla:
2412 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2413 case Intrinsic::aarch64_sve_fmls:
2414 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2415 case Intrinsic::aarch64_sve_fmul:
2416 if (auto II_U =
2417 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2418 return II_U;
2419 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2420 case Intrinsic::aarch64_sve_fmul_u:
2421 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2422 case Intrinsic::aarch64_sve_fmulx:
2423 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2424 case Intrinsic::aarch64_sve_fnmla:
2425 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2426 case Intrinsic::aarch64_sve_fnmls:
2427 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2428 case Intrinsic::aarch64_sve_fsub:
2429 return instCombineSVEVectorFSub(IC, II);
2430 case Intrinsic::aarch64_sve_fsub_u:
2431 return instCombineSVEVectorFSubU(IC, II);
2432 case Intrinsic::aarch64_sve_add:
2433 return instCombineSVEVectorAdd(IC, II);
2434 case Intrinsic::aarch64_sve_add_u:
2435 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2436 Intrinsic::aarch64_sve_mla_u>(
2437 IC, II, true);
2438 case Intrinsic::aarch64_sve_mla:
2439 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2440 case Intrinsic::aarch64_sve_mls:
2441 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2442 case Intrinsic::aarch64_sve_mul:
2443 if (auto II_U =
2444 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2445 return II_U;
2446 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2447 case Intrinsic::aarch64_sve_mul_u:
2448 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2449 case Intrinsic::aarch64_sve_sabd:
2450 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2451 case Intrinsic::aarch64_sve_smax:
2452 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2453 case Intrinsic::aarch64_sve_smin:
2454 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2455 case Intrinsic::aarch64_sve_smulh:
2456 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2457 case Intrinsic::aarch64_sve_sub:
2458 return instCombineSVEVectorSub(IC, II);
2459 case Intrinsic::aarch64_sve_sub_u:
2460 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2461 Intrinsic::aarch64_sve_mls_u>(
2462 IC, II, true);
2463 case Intrinsic::aarch64_sve_uabd:
2464 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2465 case Intrinsic::aarch64_sve_umax:
2466 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2467 case Intrinsic::aarch64_sve_umin:
2468 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2469 case Intrinsic::aarch64_sve_umulh:
2470 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2471 case Intrinsic::aarch64_sve_asr:
2472 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2473 case Intrinsic::aarch64_sve_lsl:
2474 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2475 case Intrinsic::aarch64_sve_lsr:
2476 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2477 case Intrinsic::aarch64_sve_and:
2478 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2479 case Intrinsic::aarch64_sve_bic:
2480 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2481 case Intrinsic::aarch64_sve_eor:
2482 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2483 case Intrinsic::aarch64_sve_orr:
2484 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2485 case Intrinsic::aarch64_sve_sqsub:
2486 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2487 case Intrinsic::aarch64_sve_uqsub:
2488 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2489 case Intrinsic::aarch64_sve_tbl:
2490 return instCombineSVETBL(IC, II);
2491 case Intrinsic::aarch64_sve_uunpkhi:
2492 case Intrinsic::aarch64_sve_uunpklo:
2493 case Intrinsic::aarch64_sve_sunpkhi:
2494 case Intrinsic::aarch64_sve_sunpklo:
2495 return instCombineSVEUnpack(IC, II);
2496 case Intrinsic::aarch64_sve_uzp1:
2497 return instCombineSVEUzp1(IC, II);
2498 case Intrinsic::aarch64_sve_zip1:
2499 case Intrinsic::aarch64_sve_zip2:
2500 return instCombineSVEZip(IC, II);
2501 case Intrinsic::aarch64_sve_ld1_gather_index:
2502 return instCombineLD1GatherIndex(IC, II);
2503 case Intrinsic::aarch64_sve_st1_scatter_index:
2504 return instCombineST1ScatterIndex(IC, II);
2505 case Intrinsic::aarch64_sve_ld1:
2506 return instCombineSVELD1(IC, II, DL);
2507 case Intrinsic::aarch64_sve_st1:
2508 return instCombineSVEST1(IC, II, DL);
2509 case Intrinsic::aarch64_sve_sdiv:
2510 return instCombineSVESDIV(IC, II);
2511 case Intrinsic::aarch64_sve_sel:
2512 return instCombineSVESel(IC, II);
2513 case Intrinsic::aarch64_sve_srshl:
2514 return instCombineSVESrshl(IC, II);
2515 case Intrinsic::aarch64_sve_dupq_lane:
2516 return instCombineSVEDupqLane(IC, II);
2517 case Intrinsic::aarch64_sve_insr:
2518 return instCombineSVEInsr(IC, II);
2519 }
2520
2521 return std::nullopt;
2522}
2523
2525 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2526 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2527 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2528 SimplifyAndSetOp) const {
2529 switch (II.getIntrinsicID()) {
2530 default:
2531 break;
2532 case Intrinsic::aarch64_neon_fcvtxn:
2533 case Intrinsic::aarch64_neon_rshrn:
2534 case Intrinsic::aarch64_neon_sqrshrn:
2535 case Intrinsic::aarch64_neon_sqrshrun:
2536 case Intrinsic::aarch64_neon_sqshrn:
2537 case Intrinsic::aarch64_neon_sqshrun:
2538 case Intrinsic::aarch64_neon_sqxtn:
2539 case Intrinsic::aarch64_neon_sqxtun:
2540 case Intrinsic::aarch64_neon_uqrshrn:
2541 case Intrinsic::aarch64_neon_uqshrn:
2542 case Intrinsic::aarch64_neon_uqxtn:
2543 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2544 break;
2545 }
2546
2547 return std::nullopt;
2548}
2549
2551 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2553}
2554
2557 switch (K) {
2559 return TypeSize::getFixed(64);
2561 if (ST->useSVEForFixedLengthVectors() &&
2563 return TypeSize::getFixed(
2564 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2565 else if (ST->isNeonAvailable())
2566 return TypeSize::getFixed(128);
2567 else
2568 return TypeSize::getFixed(0);
2570 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2572 return TypeSize::getScalable(128);
2573 else
2574 return TypeSize::getScalable(0);
2575 }
2576 llvm_unreachable("Unsupported register kind");
2577}
2578
2579bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2581 Type *SrcOverrideTy) {
2582 // A helper that returns a vector type from the given type. The number of
2583 // elements in type Ty determines the vector width.
2584 auto toVectorTy = [&](Type *ArgTy) {
2585 return VectorType::get(ArgTy->getScalarType(),
2586 cast<VectorType>(DstTy)->getElementCount());
2587 };
2588
2589 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2590 // i32, i64]. SVE doesn't generally have the same set of instructions to
2591 // perform an extend with the add/sub/mul. There are SMULLB style
2592 // instructions, but they operate on top/bottom, requiring some sort of lane
2593 // interleaving to be used with zext/sext.
2594 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2595 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2596 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2597 return false;
2598
2599 // Determine if the operation has a widening variant. We consider both the
2600 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2601 // instructions.
2602 //
2603 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2604 // verify that their extending operands are eliminated during code
2605 // generation.
2606 Type *SrcTy = SrcOverrideTy;
2607 switch (Opcode) {
2608 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2609 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2610 // The second operand needs to be an extend
2611 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2612 if (!SrcTy)
2613 SrcTy =
2614 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2615 } else
2616 return false;
2617 break;
2618 case Instruction::Mul: { // SMULL(2), UMULL(2)
2619 // Both operands need to be extends of the same type.
2620 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2621 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2622 if (!SrcTy)
2623 SrcTy =
2624 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2625 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2626 // If one of the operands is a Zext and the other has enough zero bits to
2627 // be treated as unsigned, we can still general a umull, meaning the zext
2628 // is free.
2629 KnownBits Known =
2630 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2631 if (Args[0]->getType()->getScalarSizeInBits() -
2632 Known.Zero.countLeadingOnes() >
2633 DstTy->getScalarSizeInBits() / 2)
2634 return false;
2635 if (!SrcTy)
2636 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2637 DstTy->getScalarSizeInBits() / 2));
2638 } else
2639 return false;
2640 break;
2641 }
2642 default:
2643 return false;
2644 }
2645
2646 // Legalize the destination type and ensure it can be used in a widening
2647 // operation.
2648 auto DstTyL = getTypeLegalizationCost(DstTy);
2649 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2650 return false;
2651
2652 // Legalize the source type and ensure it can be used in a widening
2653 // operation.
2654 assert(SrcTy && "Expected some SrcTy");
2655 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2656 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2657 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2658 return false;
2659
2660 // Get the total number of vector elements in the legalized types.
2661 InstructionCost NumDstEls =
2662 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2663 InstructionCost NumSrcEls =
2664 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2665
2666 // Return true if the legalized types have the same number of vector elements
2667 // and the destination element type size is twice that of the source type.
2668 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2669}
2670
2671// s/urhadd instructions implement the following pattern, making the
2672// extends free:
2673// %x = add ((zext i8 -> i16), 1)
2674// %y = (zext i8 -> i16)
2675// trunc i16 (lshr (add %x, %y), 1) -> i8
2676//
2678 Type *Src) {
2679 // The source should be a legal vector type.
2680 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2681 (Src->isScalableTy() && !ST->hasSVE2()))
2682 return false;
2683
2684 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2685 return false;
2686
2687 // Look for trunc/shl/add before trying to match the pattern.
2688 const Instruction *Add = ExtUser;
2689 auto *AddUser =
2690 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2691 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2692 Add = AddUser;
2693
2694 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2695 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2696 return false;
2697
2698 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2699 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2700 Src->getScalarSizeInBits() !=
2701 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2702 return false;
2703
2704 // Try to match the whole pattern. Ext could be either the first or second
2705 // m_ZExtOrSExt matched.
2706 Instruction *Ex1, *Ex2;
2707 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2708 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2709 return false;
2710
2711 // Ensure both extends are of the same type
2712 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2713 Ex1->getOpcode() == Ex2->getOpcode())
2714 return true;
2715
2716 return false;
2717}
2718
2720 Type *Src,
2723 const Instruction *I) {
2724 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2725 assert(ISD && "Invalid opcode");
2726 // If the cast is observable, and it is used by a widening instruction (e.g.,
2727 // uaddl, saddw, etc.), it may be free.
2728 if (I && I->hasOneUser()) {
2729 auto *SingleUser = cast<Instruction>(*I->user_begin());
2730 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2731 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2732 // For adds only count the second operand as free if both operands are
2733 // extends but not the same operation. (i.e both operands are not free in
2734 // add(sext, zext)).
2735 if (SingleUser->getOpcode() == Instruction::Add) {
2736 if (I == SingleUser->getOperand(1) ||
2737 (isa<CastInst>(SingleUser->getOperand(1)) &&
2738 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2739 return 0;
2740 } else // Others are free so long as isWideningInstruction returned true.
2741 return 0;
2742 }
2743
2744 // The cast will be free for the s/urhadd instructions
2745 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2746 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2747 return 0;
2748 }
2749
2750 // TODO: Allow non-throughput costs that aren't binary.
2751 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2753 return Cost == 0 ? 0 : 1;
2754 return Cost;
2755 };
2756
2757 EVT SrcTy = TLI->getValueType(DL, Src);
2758 EVT DstTy = TLI->getValueType(DL, Dst);
2759
2760 if (!SrcTy.isSimple() || !DstTy.isSimple())
2761 return AdjustCost(
2762 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2763
2764 static const TypeConversionCostTblEntry BF16Tbl[] = {
2765 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
2766 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
2767 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
2768 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
2769 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
2770 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
2771 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
2772 };
2773
2774 if (ST->hasBF16())
2775 if (const auto *Entry = ConvertCostTableLookup(
2776 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2777 return AdjustCost(Entry->Cost);
2778
2779 static const TypeConversionCostTblEntry ConversionTbl[] = {
2780 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2781 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2782 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2783 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2784 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2785 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2786 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2787 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2788 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2789 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2790 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2791 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2792 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2793 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2794 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2795 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2796 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2797 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2798 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2799 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2800
2801 // Truncations on nxvmiN
2802 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
2803 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
2804 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
2805 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
2806 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
2807 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
2808 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
2809 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
2810 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
2811 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
2812 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
2813 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
2814 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
2815 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
2816 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
2817 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
2818 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
2819 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
2820 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
2821 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
2822 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
2823 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
2824 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
2825 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
2826 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
2827 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
2828 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
2829 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
2830 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
2831 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
2832 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
2833 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
2834 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
2835
2836 // The number of shll instructions for the extension.
2837 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2838 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
2839 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2840 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
2841 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2842 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
2843 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2844 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
2845 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2846 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
2847 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2848 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
2849 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2850 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
2851 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2852 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
2853
2854 // FP Ext and trunc
2855 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
2856 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
2857 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
2858 // FP16
2859 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
2860 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
2861 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
2862 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
2863 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
2864 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
2865 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
2866 // BF16 (uses shift)
2867 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
2868 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
2869 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
2870 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
2871 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
2872 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
2873 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
2874 // FP Ext and trunc
2875 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
2876 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
2877 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
2878 // FP16
2879 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
2880 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
2881 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
2882 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
2883 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
2884 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
2885 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
2886 // BF16 (more complex, with +bf16 is handled above)
2887 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
2888 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
2889 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
2890 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
2891 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
2892 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
2893 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
2894 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
2895
2896 // LowerVectorINT_TO_FP:
2897 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2898 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2899 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2900 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
2901 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
2902 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
2903
2904 // Complex: to v2f32
2905 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2906 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2907 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2908 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
2909 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
2910 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2},
2911
2912 // Complex: to v4f32
2913 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
2914 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2915 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
2916 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
2917
2918 // Complex: to v8f32
2919 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2920 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2921 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
2922 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
2923
2924 // Complex: to v16f32
2925 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2926 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
2927
2928 // Complex: to v2f64
2929 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2930 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2931 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2932 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
2933 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
2934 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
2935
2936 // Complex: to v4f64
2937 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2938 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
2939
2940 // LowerVectorFP_TO_INT
2941 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
2942 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
2943 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
2944 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
2945 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
2946 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
2947
2948 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2949 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
2950 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
2951 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
2952 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
2953 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
2954 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
2955
2956 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2957 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
2958 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
2959 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
2960 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
2961
2962 // Complex, from nxv2f32.
2963 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2964 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2965 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2966 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2967 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
2968 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
2969 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
2970 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
2971
2972 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2973 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
2974 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
2975 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
2976 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
2977 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
2978 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
2979
2980 // Complex, from nxv2f64.
2981 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2982 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2983 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2984 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2985 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
2986 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
2987 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
2988 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
2989
2990 // Complex, from nxv4f32.
2991 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
2992 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
2993 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
2994 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
2995 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
2996 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
2997 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
2998 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
2999
3000 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3001 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3002 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3003 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3004 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3005
3006 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3007 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3008 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3009 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3010 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3011 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3012 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3013
3014 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3015 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3016 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3017 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3018 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3019
3020 // Complex, from nxv8f16.
3021 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3022 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3023 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3024 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3025 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3026 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3027 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3028 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3029
3030 // Complex, from nxv4f16.
3031 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3032 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3033 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3034 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3035 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3036 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3037 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3038 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3039
3040 // Complex, from nxv2f16.
3041 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3042 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3043 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3044 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3045 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3046 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3047 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3048 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3049
3050 // Truncate from nxvmf32 to nxvmf16.
3051 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3052 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3053 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3054
3055 // Truncate from nxvmf64 to nxvmf16.
3056 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
3057 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
3058 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
3059
3060 // Truncate from nxvmf64 to nxvmf32.
3061 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
3062 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
3063 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
3064
3065 // Extend from nxvmf16 to nxvmf32.
3066 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
3067 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
3068 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
3069
3070 // Extend from nxvmf16 to nxvmf64.
3071 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
3072 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
3073 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
3074
3075 // Extend from nxvmf32 to nxvmf64.
3076 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
3077 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
3078 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
3079
3080 // Bitcasts from float to integer
3081 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
3082 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
3083 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
3084
3085 // Bitcasts from integer to float
3086 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
3087 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
3088 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
3089
3090 // Add cost for extending to illegal -too wide- scalable vectors.
3091 // zero/sign extend are implemented by multiple unpack operations,
3092 // where each operation has a cost of 1.
3093 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3094 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3095 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3096 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3097 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3098 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3099
3100 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
3101 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
3102 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
3103 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
3104 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
3105 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
3106 };
3107
3108 // We have to estimate a cost of fixed length operation upon
3109 // SVE registers(operations) with the number of registers required
3110 // for a fixed type to be represented upon SVE registers.
3111 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3112 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3113 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3114 ST->useSVEForFixedLengthVectors(WiderTy)) {
3115 std::pair<InstructionCost, MVT> LT =
3116 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3117 unsigned NumElements =
3118 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3119 return AdjustCost(
3120 LT.first *
3122 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
3123 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3124 CostKind, I));
3125 }
3126
3127 if (const auto *Entry = ConvertCostTableLookup(
3128 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3129 return AdjustCost(Entry->Cost);
3130
3131 static const TypeConversionCostTblEntry FP16Tbl[] = {
3132 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
3133 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
3134 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
3135 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
3136 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
3137 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
3138 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
3139 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
3140 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
3141 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
3142 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
3143 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
3144 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
3145 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
3146 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
3147 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
3148 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
3149 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
3150 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
3151 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
3152 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
3153 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
3154 };
3155
3156 if (ST->hasFullFP16())
3157 if (const auto *Entry = ConvertCostTableLookup(
3158 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3159 return AdjustCost(Entry->Cost);
3160
3161 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3164 TLI->getTypeAction(Src->getContext(), SrcTy) ==
3166 TLI->getTypeAction(Dst->getContext(), DstTy) ==
3168 // The standard behaviour in the backend for these cases is to split the
3169 // extend up into two parts:
3170 // 1. Perform an extending load or masked load up to the legal type.
3171 // 2. Extend the loaded data to the final type.
3172 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
3173 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
3175 Opcode, LegalTy, Src, CCH, CostKind, I);
3177 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
3178 return Part1 + Part2;
3179 }
3180
3181 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3182 // but we also want to include the TTI::CastContextHint::Masked case too.
3183 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3185 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
3187
3188 return AdjustCost(
3189 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3190}
3191
3193 Type *Dst,
3194 VectorType *VecTy,
3195 unsigned Index) {
3196
3197 // Make sure we were given a valid extend opcode.
3198 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3199 "Invalid opcode");
3200
3201 // We are extending an element we extract from a vector, so the source type
3202 // of the extend is the element type of the vector.
3203 auto *Src = VecTy->getElementType();
3204
3205 // Sign- and zero-extends are for integer types only.
3206 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3207
3208 // Get the cost for the extract. We compute the cost (if any) for the extend
3209 // below.
3211 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
3212 CostKind, Index, nullptr, nullptr);
3213
3214 // Legalize the types.
3215 auto VecLT = getTypeLegalizationCost(VecTy);
3216 auto DstVT = TLI->getValueType(DL, Dst);
3217 auto SrcVT = TLI->getValueType(DL, Src);
3218
3219 // If the resulting type is still a vector and the destination type is legal,
3220 // we may get the extension for free. If not, get the default cost for the
3221 // extend.
3222 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
3223 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3224 CostKind);
3225
3226 // The destination type should be larger than the element type. If not, get
3227 // the default cost for the extend.
3228 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3229 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3230 CostKind);
3231
3232 switch (Opcode) {
3233 default:
3234 llvm_unreachable("Opcode should be either SExt or ZExt");
3235
3236 // For sign-extends, we only need a smov, which performs the extension
3237 // automatically.
3238 case Instruction::SExt:
3239 return Cost;
3240
3241 // For zero-extends, the extend is performed automatically by a umov unless
3242 // the destination type is i64 and the element type is i8 or i16.
3243 case Instruction::ZExt:
3244 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3245 return Cost;
3246 }
3247
3248 // If we are unable to perform the extend for free, get the default cost.
3249 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
3250 CostKind);
3251}
3252
3255 const Instruction *I) {
3257 return Opcode == Instruction::PHI ? 0 : 1;
3258 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3259 // Branches are assumed to be predicted.
3260 return 0;
3261}
3262
3263InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3264 unsigned Opcode, Type *Val, unsigned Index, bool HasRealUse,
3265 const Instruction *I, Value *Scalar,
3266 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3267 assert(Val->isVectorTy() && "This must be a vector type");
3268
3269 if (Index != -1U) {
3270 // Legalize the type.
3271 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3272
3273 // This type is legalized to a scalar type.
3274 if (!LT.second.isVector())
3275 return 0;
3276
3277 // The type may be split. For fixed-width vectors we can normalize the
3278 // index to the new type.
3279 if (LT.second.isFixedLengthVector()) {
3280 unsigned Width = LT.second.getVectorNumElements();
3281 Index = Index % Width;
3282 }
3283
3284 // The element at index zero is already inside the vector.
3285 // - For a physical (HasRealUse==true) insert-element or extract-element
3286 // instruction that extracts integers, an explicit FPR -> GPR move is
3287 // needed. So it has non-zero cost.
3288 // - For the rest of cases (virtual instruction or element type is float),
3289 // consider the instruction free.
3290 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3291 return 0;
3292
3293 // This is recognising a LD1 single-element structure to one lane of one
3294 // register instruction. I.e., if this is an `insertelement` instruction,
3295 // and its second operand is a load, then we will generate a LD1, which
3296 // are expensive instructions.
3297 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3298 return ST->getVectorInsertExtractBaseCost() + 1;
3299
3300 // i1 inserts and extract will include an extra cset or cmp of the vector
3301 // value. Increase the cost by 1 to account.
3302 if (Val->getScalarSizeInBits() == 1)
3303 return ST->getVectorInsertExtractBaseCost() + 1;
3304
3305 // FIXME:
3306 // If the extract-element and insert-element instructions could be
3307 // simplified away (e.g., could be combined into users by looking at use-def
3308 // context), they have no cost. This is not done in the first place for
3309 // compile-time considerations.
3310 }
3311
3312 // In case of Neon, if there exists extractelement from lane != 0 such that
3313 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
3314 // 2. extractelement result feeds into fmul.
3315 // 3. Other operand of fmul is an extractelement from lane 0 or lane
3316 // equivalent to 0.
3317 // then the extractelement can be merged with fmul in the backend and it
3318 // incurs no cost.
3319 // e.g.
3320 // define double @foo(<2 x double> %a) {
3321 // %1 = extractelement <2 x double> %a, i32 0
3322 // %2 = extractelement <2 x double> %a, i32 1
3323 // %res = fmul double %1, %2
3324 // ret double %res
3325 // }
3326 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3327 auto ExtractCanFuseWithFmul = [&]() {
3328 // We bail out if the extract is from lane 0.
3329 if (Index == 0)
3330 return false;
3331
3332 // Check if the scalar element type of the vector operand of ExtractElement
3333 // instruction is one of the allowed types.
3334 auto IsAllowedScalarTy = [&](const Type *T) {
3335 return T->isFloatTy() || T->isDoubleTy() ||
3336 (T->isHalfTy() && ST->hasFullFP16());
3337 };
3338
3339 // Check if the extractelement user is scalar fmul.
3340 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3341 // Check if the user is scalar fmul.
3342 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
3343 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3344 !BO->getType()->isVectorTy();
3345 };
3346
3347 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3348 // certain scalar type and a certain vector register width.
3349 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3350 auto RegWidth =
3352 .getFixedValue();
3353 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3354 };
3355
3356 // Check if the type constraints on input vector type and result scalar type
3357 // of extractelement instruction are satisfied.
3358 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3359 return false;
3360
3361 if (Scalar) {
3362 DenseMap<User *, unsigned> UserToExtractIdx;
3363 for (auto *U : Scalar->users()) {
3364 if (!IsUserFMulScalarTy(U))
3365 return false;
3366 // Recording entry for the user is important. Index value is not
3367 // important.
3368 UserToExtractIdx[U];
3369 }
3370 if (UserToExtractIdx.empty())
3371 return false;
3372 for (auto &[S, U, L] : ScalarUserAndIdx) {
3373 for (auto *U : S->users()) {
3374 if (UserToExtractIdx.find(U) != UserToExtractIdx.end()) {
3375 auto *FMul = cast<BinaryOperator>(U);
3376 auto *Op0 = FMul->getOperand(0);
3377 auto *Op1 = FMul->getOperand(1);
3378 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3379 UserToExtractIdx[U] = L;
3380 break;
3381 }
3382 }
3383 }
3384 }
3385 for (auto &[U, L] : UserToExtractIdx) {
3386 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3387 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3388 return false;
3389 }
3390 } else {
3391 const auto *EE = cast<ExtractElementInst>(I);
3392
3393 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
3394 if (!IdxOp)
3395 return false;
3396
3397 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
3398 if (!IsUserFMulScalarTy(U))
3399 return false;
3400
3401 // Check if the other operand of extractelement is also extractelement
3402 // from lane equivalent to 0.
3403 const auto *BO = cast<BinaryOperator>(U);
3404 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3405 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
3406 if (OtherEE) {
3407 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
3408 if (!IdxOp)
3409 return false;
3410 return IsExtractLaneEquivalentToZero(
3411 cast<ConstantInt>(OtherEE->getIndexOperand())
3412 ->getValue()
3413 .getZExtValue(),
3414 OtherEE->getType()->getScalarSizeInBits());
3415 }
3416 return true;
3417 });
3418 }
3419 return true;
3420 };
3421
3422 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3423 ExtractCanFuseWithFmul())
3424 return 0;
3425
3426 // All other insert/extracts cost this much.
3427 return ST->getVectorInsertExtractBaseCost();
3428}
3429
3432 unsigned Index, Value *Op0,
3433 Value *Op1) {
3434 bool HasRealUse =
3435 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3436 return getVectorInstrCostHelper(Opcode, Val, Index, HasRealUse);
3437}
3438
3440 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3441 Value *Scalar,
3442 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
3443 return getVectorInstrCostHelper(Opcode, Val, Index, false, nullptr, Scalar,
3444 ScalarUserAndIdx);
3445}
3446
3448 Type *Val,
3450 unsigned Index) {
3451 return getVectorInstrCostHelper(I.getOpcode(), Val, Index,
3452 true /* HasRealUse */, &I);
3453}
3454
3456 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3458 if (isa<ScalableVectorType>(Ty))
3460 if (Ty->getElementType()->isFloatingPointTy())
3461 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3462 CostKind);
3463 return DemandedElts.popcount() * (Insert + Extract) *
3465}
3466
3468 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3471 const Instruction *CxtI) {
3472
3473 // The code-generator is currently not able to handle scalable vectors
3474 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3475 // it. This change will be removed when code-generation for these types is
3476 // sufficiently reliable.
3477 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3478 if (VTy->getElementCount() == ElementCount::getScalable(1))
3480
3481 // TODO: Handle more cost kinds.
3483 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3484 Op2Info, Args, CxtI);
3485
3486 // Legalize the type.
3487 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3488 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3489
3490 switch (ISD) {
3491 default:
3492 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3493 Op2Info);
3494 case ISD::SDIV:
3495 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3496 // On AArch64, scalar signed division by constants power-of-two are
3497 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3498 // The OperandValue properties many not be same as that of previous
3499 // operation; conservatively assume OP_None.
3501 Instruction::Add, Ty, CostKind,
3502 Op1Info.getNoProps(), Op2Info.getNoProps());
3503 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3504 Op1Info.getNoProps(), Op2Info.getNoProps());
3506 Instruction::Select, Ty, CostKind,
3507 Op1Info.getNoProps(), Op2Info.getNoProps());
3508 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3509 Op1Info.getNoProps(), Op2Info.getNoProps());
3510 return Cost;
3511 }
3512 [[fallthrough]];
3513 case ISD::UDIV: {
3514 auto VT = TLI->getValueType(DL, Ty);
3515 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3516 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3517 // Vector signed division by constant are expanded to the
3518 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3519 // to MULHS + SUB + SRL + ADD + SRL.
3521 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3523 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3525 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3526 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3527 }
3528 }
3529
3530 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
3531 // emitted by the backend even when those functions are not declared in the
3532 // module.
3533 if (!VT.isVector() && VT.getSizeInBits() > 64)
3534 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3535
3537 Opcode, Ty, CostKind, Op1Info, Op2Info);
3538 if (Ty->isVectorTy()) {
3539 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3540 // SDIV/UDIV operations are lowered using SVE, then we can have less
3541 // costs.
3542 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3543 ->getPrimitiveSizeInBits()
3544 .getFixedValue() < 128) {
3545 EVT VT = TLI->getValueType(DL, Ty);
3546 static const CostTblEntry DivTbl[]{
3547 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3548 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3549 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3550 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3551 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3552 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3553
3554 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3555 if (nullptr != Entry)
3556 return Entry->Cost;
3557 }
3558 // For 8/16-bit elements, the cost is higher because the type
3559 // requires promotion and possibly splitting:
3560 if (LT.second.getScalarType() == MVT::i8)
3561 Cost *= 8;
3562 else if (LT.second.getScalarType() == MVT::i16)
3563 Cost *= 4;
3564 return Cost;
3565 } else {
3566 // If one of the operands is a uniform constant then the cost for each
3567 // element is Cost for insertion, extraction and division.
3568 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
3569 // operation with scalar type
3570 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3571 (Op2Info.isConstant() && Op2Info.isUniform())) {
3572 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3574 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3575 return (4 + DivCost) * VTy->getNumElements();
3576 }
3577 }
3578 // On AArch64, without SVE, vector divisions are expanded
3579 // into scalar divisions of each pair of elements.
3580 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3581 CostKind, Op1Info, Op2Info);
3582 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3583 Op1Info, Op2Info);
3584 }
3585
3586 // TODO: if one of the arguments is scalar, then it's not necessary to
3587 // double the cost of handling the vector elements.
3588 Cost += Cost;
3589 }
3590 return Cost;
3591 }
3592 case ISD::MUL:
3593 // When SVE is available, then we can lower the v2i64 operation using
3594 // the SVE mul instruction, which has a lower cost.
3595 if (LT.second == MVT::v2i64 && ST->hasSVE())
3596 return LT.first;
3597
3598 // When SVE is not available, there is no MUL.2d instruction,
3599 // which means mul <2 x i64> is expensive as elements are extracted
3600 // from the vectors and the muls scalarized.
3601 // As getScalarizationOverhead is a bit too pessimistic, we
3602 // estimate the cost for a i64 vector directly here, which is:
3603 // - four 2-cost i64 extracts,
3604 // - two 2-cost i64 inserts, and
3605 // - two 1-cost muls.
3606 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
3607 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3608 // need to scalarize so the cost can be cheaper (smull or umull).
3609 // so the cost can be cheaper (smull or umull).
3610 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3611 return LT.first;
3612 return LT.first * 14;
3613 case ISD::ADD:
3614 case ISD::XOR:
3615 case ISD::OR:
3616 case ISD::AND:
3617 case ISD::SRL:
3618 case ISD::SRA:
3619 case ISD::SHL:
3620 // These nodes are marked as 'custom' for combining purposes only.
3621 // We know that they are legal. See LowerAdd in ISelLowering.
3622 return LT.first;
3623
3624 case ISD::FNEG:
3625 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3626 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3627 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3628 CxtI &&
3629 ((CxtI->hasOneUse() &&
3630 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3631 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3632 return 0;
3633 [[fallthrough]];
3634 case ISD::FADD:
3635 case ISD::FSUB:
3636 // Increase the cost for half and bfloat types if not architecturally
3637 // supported.
3638 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3639 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3640 return 2 * LT.first;
3641 if (!Ty->getScalarType()->isFP128Ty())
3642 return LT.first;
3643 [[fallthrough]];
3644 case ISD::FMUL:
3645 case ISD::FDIV:
3646 // These nodes are marked as 'custom' just to lower them to SVE.
3647 // We know said lowering will incur no additional cost.
3648 if (!Ty->getScalarType()->isFP128Ty())
3649 return 2 * LT.first;
3650
3651 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3652 Op2Info);
3653 case ISD::FREM:
3654 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3655 // those functions are not declared in the module.
3656 if (!Ty->isVectorTy())
3657 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3658 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3659 Op2Info);
3660 }
3661}
3662
3664 ScalarEvolution *SE,
3665 const SCEV *Ptr) {
3666 // Address computations in vectorized code with non-consecutive addresses will
3667 // likely result in more instructions compared to scalar code where the
3668 // computation can more often be merged into the index mode. The resulting
3669 // extra micro-ops can significantly decrease throughput.
3670 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3671 int MaxMergeDistance = 64;
3672
3673 if (Ty->isVectorTy() && SE &&
3674 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3675 return NumVectorInstToHideOverhead;
3676
3677 // In many cases the address computation is not merged into the instruction
3678 // addressing mode.
3679 return 1;
3680}
3681
3683 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
3685 TTI::OperandValueInfo Op2Info, const Instruction *I) {
3686 // TODO: Handle other cost kinds.
3688 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3689 Op1Info, Op2Info, I);
3690
3691 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3692 // We don't lower some vector selects well that are wider than the register
3693 // width.
3694 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3695 // We would need this many instructions to hide the scalarization happening.
3696 const int AmortizationCost = 20;
3697
3698 // If VecPred is not set, check if we can get a predicate from the context
3699 // instruction, if its type matches the requested ValTy.
3700 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3701 CmpPredicate CurrentPred;
3702 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3703 m_Value())))
3704 VecPred = CurrentPred;
3705 }
3706 // Check if we have a compare/select chain that can be lowered using
3707 // a (F)CMxx & BFI pair.
3708 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3709 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3710 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3711 VecPred == CmpInst::FCMP_UNE) {
3712 static const auto ValidMinMaxTys = {
3713 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3714 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3715 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3716
3717 auto LT = getTypeLegalizationCost(ValTy);
3718 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3719 (ST->hasFullFP16() &&
3720 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3721 return LT.first;
3722 }
3723
3724 static const TypeConversionCostTblEntry
3725 VectorSelectTbl[] = {
3726 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3727 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3728 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3729 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3730 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3731 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3732 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3733 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3734 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3735 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3736 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3737 };
3738
3739 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3740 EVT SelValTy = TLI->getValueType(DL, ValTy);
3741 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3742 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3743 SelCondTy.getSimpleVT(),
3744 SelValTy.getSimpleVT()))
3745 return Entry->Cost;
3746 }
3747 }
3748
3749 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3750 auto LT = getTypeLegalizationCost(ValTy);
3751 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3752 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3753 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3754 }
3755
3756 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3757 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3758 // be profitable.
3759 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3760 ICmpInst::isEquality(VecPred) &&
3761 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3762 match(I->getOperand(1), m_Zero()) &&
3763 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3764 return 0;
3765
3766 // The base case handles scalable vectors fine for now, since it treats the
3767 // cost as 1 * legalization cost.
3768 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3769 Op1Info, Op2Info, I);
3770}
3771
3773AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3775 if (ST->requiresStrictAlign()) {
3776 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3777 // a bunch of instructions when strict align is enabled.
3778 return Options;
3779 }
3780 Options.AllowOverlappingLoads = true;
3781 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3782 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3783 // TODO: Though vector loads usually perform well on AArch64, in some targets
3784 // they may wake up the FP unit, which raises the power consumption. Perhaps
3785 // they could be used with no holds barred (-O3).
3786 Options.LoadSizes = {8, 4, 2, 1};
3787 Options.AllowedTailExpansions = {3, 5, 6};
3788 return Options;
3789}
3790
3792 return ST->hasSVE();
3793}
3794
3797 Align Alignment, unsigned AddressSpace,
3799 if (useNeonVector(Src))
3800 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3801 CostKind);
3802 auto LT = getTypeLegalizationCost(Src);
3803 if (!LT.first.isValid())
3805
3806 // Return an invalid cost for element types that we are unable to lower.
3807 auto *VT = cast<VectorType>(Src);
3808 if (VT->getElementType()->isIntegerTy(1))
3810
3811 // The code-generator is currently not able to handle scalable vectors
3812 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3813 // it. This change will be removed when code-generation for these types is
3814 // sufficiently reliable.
3815 if (VT->getElementCount() == ElementCount::getScalable(1))
3817
3818 return LT.first;
3819}
3820
3821// This function returns gather/scatter overhead either from
3822// user-provided value or specialized values per-target from \p ST.
3823static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3824 const AArch64Subtarget *ST) {
3825 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3826 "Should be called on only load or stores.");
3827 switch (Opcode) {
3828 case Instruction::Load:
3829 if (SVEGatherOverhead.getNumOccurrences() > 0)
3830 return SVEGatherOverhead;
3831 return ST->getGatherOverhead();
3832 break;
3833 case Instruction::Store:
3834 if (SVEScatterOverhead.getNumOccurrences() > 0)
3835 return SVEScatterOverhead;
3836 return ST->getScatterOverhead();
3837 break;
3838 default:
3839 llvm_unreachable("Shouldn't have reached here");
3840 }
3841}
3842
3844 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3845 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3846 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3847 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3848 Alignment, CostKind, I);
3849 auto *VT = cast<VectorType>(DataTy);
3850 auto LT = getTypeLegalizationCost(DataTy);
3851 if (!LT.first.isValid())
3853
3854 // Return an invalid cost for element types that we are unable to lower.
3855 if (!LT.second.isVector() ||
3856 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3857 VT->getElementType()->isIntegerTy(1))
3859
3860 // The code-generator is currently not able to handle scalable vectors
3861 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3862 // it. This change will be removed when code-generation for these types is
3863 // sufficiently reliable.
3864 if (VT->getElementCount() == ElementCount::getScalable(1))
3866
3867 ElementCount LegalVF = LT.second.getVectorElementCount();
3868 InstructionCost MemOpCost =
3869 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3870 {TTI::OK_AnyValue, TTI::OP_None}, I);
3871 // Add on an overhead cost for using gathers/scatters.
3872 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3873 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3874}
3875
3877 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3878}
3879
3881 MaybeAlign Alignment,
3882 unsigned AddressSpace,
3884 TTI::OperandValueInfo OpInfo,
3885 const Instruction *I) {
3886 EVT VT = TLI->getValueType(DL, Ty, true);
3887 // Type legalization can't handle structs
3888 if (VT == MVT::Other)
3889 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3890 CostKind);
3891
3892 auto LT = getTypeLegalizationCost(Ty);
3893 if (!LT.first.isValid())
3895
3896 // The code-generator is currently not able to handle scalable vectors
3897 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3898 // it. This change will be removed when code-generation for these types is
3899 // sufficiently reliable.
3900 // We also only support full register predicate loads and stores.
3901 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3902 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3903 (VTy->getElementType()->isIntegerTy(1) &&
3904 !VTy->getElementCount().isKnownMultipleOf(
3907
3908 // TODO: consider latency as well for TCK_SizeAndLatency.
3910 return LT.first;
3911
3913 return 1;
3914
3915 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3916 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3917 // Unaligned stores are extremely inefficient. We don't split all
3918 // unaligned 128-bit stores because the negative impact that has shown in
3919 // practice on inlined block copy code.
3920 // We make such stores expensive so that we will only vectorize if there
3921 // are 6 other instructions getting vectorized.
3922 const int AmortizationCost = 6;
3923
3924 return LT.first * 2 * AmortizationCost;
3925 }
3926
3927 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3928 if (Ty->isPtrOrPtrVectorTy())
3929 return LT.first;
3930
3931 if (useNeonVector(Ty)) {
3932 // Check truncating stores and extending loads.
3933 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3934 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
3935 if (VT == MVT::v4i8)
3936 return 2;
3937 // Otherwise we need to scalarize.
3938 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3939 }
3940 EVT EltVT = VT.getVectorElementType();
3941 unsigned EltSize = EltVT.getScalarSizeInBits();
3942 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3943 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3944 *Alignment != Align(1))
3945 return LT.first;
3946 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3947 // widening to v4i8, which produces suboptimal results.
3948 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3949 return LT.first;
3950
3951 // Check non-power-of-2 loads/stores for legal vector element types with
3952 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3953 // operations on smaller power-of-2 ops, including ld1/st1.
3954 LLVMContext &C = Ty->getContext();
3956 SmallVector<EVT> TypeWorklist;
3957 TypeWorklist.push_back(VT);
3958 while (!TypeWorklist.empty()) {
3959 EVT CurrVT = TypeWorklist.pop_back_val();
3960 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3961 if (isPowerOf2_32(CurrNumElements)) {
3962 Cost += 1;
3963 continue;
3964 }
3965
3966 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3967 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3968 TypeWorklist.push_back(
3969 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3970 }
3971 return Cost;
3972 }
3973
3974 return LT.first;
3975}
3976
3978 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3979 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3980 bool UseMaskForCond, bool UseMaskForGaps) {
3981 assert(Factor >= 2 && "Invalid interleave factor");
3982 auto *VecVTy = cast<VectorType>(VecTy);
3983
3984 if (VecTy->isScalableTy() && !ST->hasSVE())
3986
3987 // Vectorization for masked interleaved accesses is only enabled for scalable
3988 // VF.
3989 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3991
3992 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3993 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3994 auto *SubVecTy =
3995 VectorType::get(VecVTy->getElementType(),
3996 VecVTy->getElementCount().divideCoefficientBy(Factor));
3997
3998 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3999 // Accesses having vector types that are a multiple of 128 bits can be
4000 // matched to more than one ldN/stN instruction.
4001 bool UseScalable;
4002 if (MinElts % Factor == 0 &&
4003 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4004 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4005 }
4006
4007 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4008 Alignment, AddressSpace, CostKind,
4009 UseMaskForCond, UseMaskForGaps);
4010}
4011
4016 for (auto *I : Tys) {
4017 if (!I->isVectorTy())
4018 continue;
4019 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
4020 128)
4021 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
4022 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
4023 }
4024 return Cost;
4025}
4026
4028 return ST->getMaxInterleaveFactor();
4029}
4030
4031// For Falkor, we want to avoid having too many strided loads in a loop since
4032// that can exhaust the HW prefetcher resources. We adjust the unroller
4033// MaxCount preference below to attempt to ensure unrolling doesn't create too
4034// many strided loads.
4035static void
4038 enum { MaxStridedLoads = 7 };
4039 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4040 int StridedLoads = 0;
4041 // FIXME? We could make this more precise by looking at the CFG and
4042 // e.g. not counting loads in each side of an if-then-else diamond.
4043 for (const auto BB : L->blocks()) {
4044 for (auto &I : *BB) {
4045 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
4046 if (!LMemI)
4047 continue;
4048
4049 Value *PtrValue = LMemI->getPointerOperand();
4050 if (L->isLoopInvariant(PtrValue))
4051 continue;
4052
4053 const SCEV *LSCEV = SE.getSCEV(PtrValue);
4054 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
4055 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4056 continue;
4057
4058 // FIXME? We could take pairing of unrolled load copies into account
4059 // by looking at the AddRec, but we would probably have to limit this
4060 // to loops with no stores or other memory optimization barriers.
4061 ++StridedLoads;
4062 // We've seen enough strided loads that seeing more won't make a
4063 // difference.
4064 if (StridedLoads > MaxStridedLoads / 2)
4065 return StridedLoads;
4066 }
4067 }
4068 return StridedLoads;
4069 };
4070
4071 int StridedLoads = countStridedLoads(L, SE);
4072 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4073 << " strided loads\n");
4074 // Pick the largest power of 2 unroll count that won't result in too many
4075 // strided loads.
4076 if (StridedLoads) {
4077 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
4078 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4079 << UP.MaxCount << '\n');
4080 }
4081}
4082
4083/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
4084/// OOO engine's wide instruction window and various predictors.
4085static void
4089 // Limit loops with structure that is highly likely to benefit from runtime
4090 // unrolling; that is we exclude outer loops, loops with multiple exits and
4091 // many blocks (i.e. likely with complex control flow). Note that the
4092 // heuristics here may be overly conservative and we err on the side of
4093 // avoiding runtime unrolling rather than unroll excessively. They are all
4094 // subject to further refinement.
4095 if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4096 return;
4097
4098 const SCEV *BTC = SE.getBackedgeTakenCount(L);
4099 if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4100 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4101 SE.getSmallConstantMaxTripCount(L) <= 32))
4102 return;
4103 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4104 return;
4105
4106 int64_t Size = 0;
4107 for (auto *BB : L->getBlocks()) {
4108 for (auto &I : *BB) {
4109 if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4110 return;
4111 SmallVector<const Value *, 4> Operands(I.operand_values());
4112 Size +=
4114 }
4115 }
4116
4117 // Limit to loops with trip counts that are cheap to expand.
4118 UP.SCEVExpansionBudget = 1;
4119
4120 // Try to unroll small, single block loops, if they have load/store
4121 // dependencies, to expose more parallel memory access streams.
4122 BasicBlock *Header = L->getHeader();
4123 if (Header == L->getLoopLatch()) {
4124 if (Size > 8)
4125 return;
4126
4127 SmallPtrSet<Value *, 8> LoadedValues;
4129 for (auto *BB : L->blocks()) {
4130 for (auto &I : *BB) {
4132 if (!Ptr)
4133 continue;
4134 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4135 if (SE.isLoopInvariant(PtrSCEV, L))
4136 continue;
4137 if (isa<LoadInst>(&I))
4138 LoadedValues.insert(&I);
4139 else
4140 Stores.push_back(cast<StoreInst>(&I));
4141 }
4142 }
4143
4144 // Try to find an unroll count that maximizes the use of the instruction
4145 // window, i.e. trying to fetch as many instructions per cycle as possible.
4146 unsigned MaxInstsPerLine = 16;
4147 unsigned UC = 1;
4148 unsigned BestUC = 1;
4149 unsigned SizeWithBestUC = BestUC * Size;
4150 while (UC <= 8) {
4151 unsigned SizeWithUC = UC * Size;
4152 if (SizeWithUC > 48)
4153 break;
4154 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4155 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4156 BestUC = UC;
4157 SizeWithBestUC = BestUC * Size;
4158 }
4159 UC++;
4160 }
4161
4162 if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4163 return LoadedValues.contains(SI->getOperand(0));
4164 }))
4165 return;
4166
4167 UP.Runtime = true;
4168 UP.DefaultUnrollRuntimeCount = BestUC;
4169 return;
4170 }
4171
4172 // Try to runtime-unroll loops with early-continues depending on loop-varying
4173 // loads; this helps with branch-prediction for the early-continues.
4174 auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
4175 auto *Latch = L->getLoopLatch();
4177 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4178 none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4179 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
4180 return;
4181
4182 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4183 [&](Instruction *I, unsigned Depth) -> bool {
4184 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
4185 return false;
4186
4187 if (isa<LoadInst>(I))
4188 return true;
4189
4190 return any_of(I->operands(), [&](Value *V) {
4191 auto *I = dyn_cast<Instruction>(V);
4192 return I && DependsOnLoopLoad(I, Depth + 1);
4193 });
4194 };
4195 CmpPredicate Pred;
4196 Instruction *I;
4197 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
4198 m_Value())) &&
4199 DependsOnLoopLoad(I, 0)) {
4200 UP.Runtime = true;
4201 }
4202}
4203
4207 // Enable partial unrolling and runtime unrolling.
4208 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4209
4210 UP.UpperBound = true;
4211
4212 // For inner loop, it is more likely to be a hot one, and the runtime check
4213 // can be promoted out from LICM pass, so the overhead is less, let's try
4214 // a larger threshold to unroll more loops.
4215 if (L->getLoopDepth() > 1)
4216 UP.PartialThreshold *= 2;
4217
4218 // Disable partial & runtime unrolling on -Os.
4220
4221 // Apply subtarget-specific unrolling preferences.
4222 switch (ST->getProcFamily()) {
4223 case AArch64Subtarget::AppleA14:
4224 case AArch64Subtarget::AppleA15:
4225 case AArch64Subtarget::AppleA16:
4226 case AArch64Subtarget::AppleM4:
4227 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4228 break;
4229 case AArch64Subtarget::Falkor:
4232 break;
4233 default:
4234 break;
4235 }
4236
4237 // Scan the loop: don't unroll loops with calls as this could prevent
4238 // inlining. Don't unroll vector loops either, as they don't benefit much from
4239 // unrolling.
4240 for (auto *BB : L->getBlocks()) {
4241 for (auto &I : *BB) {
4242 // Don't unroll vectorised loop.
4243 if (I.getType()->isVectorTy())
4244 return;
4245
4246 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4247 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4248 if (!isLoweredToCall(F))
4249 continue;
4250 }
4251 return;
4252 }
4253 }
4254 }
4255
4256 // Enable runtime unrolling for in-order models
4257 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
4258 // checking for that case, we can ensure that the default behaviour is
4259 // unchanged
4261 !ST->getSchedModel().isOutOfOrder()) {
4262 UP.Runtime = true;
4263 UP.Partial = true;
4264 UP.UnrollRemainder = true;
4266
4267 UP.UnrollAndJam = true;
4269 }
4270}
4271
4275}
4276
4278 Type *ExpectedType) {
4279 switch (Inst->getIntrinsicID()) {
4280 default:
4281 return nullptr;
4282 case Intrinsic::aarch64_neon_st2:
4283 case Intrinsic::aarch64_neon_st3:
4284 case Intrinsic::aarch64_neon_st4: {
4285 // Create a struct type
4286 StructType *ST = dyn_cast<StructType>(ExpectedType);
4287 if (!ST)
4288 return nullptr;
4289 unsigned NumElts = Inst->arg_size() - 1;
4290 if (ST->getNumElements() != NumElts)
4291 return nullptr;
4292 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4293 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
4294 return nullptr;
4295 }
4296 Value *Res = PoisonValue::get(ExpectedType);
4297 IRBuilder<> Builder(Inst);
4298 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4299 Value *L = Inst->getArgOperand(i);
4300 Res = Builder.CreateInsertValue(Res, L, i);
4301 }
4302 return Res;
4303 }
4304 case Intrinsic::aarch64_neon_ld2:
4305 case Intrinsic::aarch64_neon_ld3:
4306 case Intrinsic::aarch64_neon_ld4:
4307 if (Inst->getType() == ExpectedType)
4308 return Inst;
4309 return nullptr;
4310 }
4311}
4312
4314 MemIntrinsicInfo &Info) {
4315 switch (Inst->getIntrinsicID()) {
4316 default:
4317 break;
4318 case Intrinsic::aarch64_neon_ld2:
4319 case Intrinsic::aarch64_neon_ld3:
4320 case Intrinsic::aarch64_neon_ld4:
4321 Info.ReadMem = true;
4322 Info.WriteMem = false;
4323 Info.PtrVal = Inst->getArgOperand(0);
4324 break;
4325 case Intrinsic::aarch64_neon_st2:
4326 case Intrinsic::aarch64_neon_st3:
4327 case Intrinsic::aarch64_neon_st4:
4328 Info.ReadMem = false;
4329 Info.WriteMem = true;
4330 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
4331 break;
4332 }
4333
4334 switch (Inst->getIntrinsicID()) {
4335 default:
4336 return false;
4337 case Intrinsic::aarch64_neon_ld2:
4338 case Intrinsic::aarch64_neon_st2:
4339 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
4340 break;
4341 case Intrinsic::aarch64_neon_ld3:
4342 case Intrinsic::aarch64_neon_st3:
4343 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
4344 break;
4345 case Intrinsic::aarch64_neon_ld4:
4346 case Intrinsic::aarch64_neon_st4:
4347 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
4348 break;
4349 }
4350 return true;
4351}
4352
4353/// See if \p I should be considered for address type promotion. We check if \p
4354/// I is a sext with right type and used in memory accesses. If it used in a
4355/// "complex" getelementptr, we allow it to be promoted without finding other
4356/// sext instructions that sign extended the same initial value. A getelementptr
4357/// is considered as "complex" if it has more than 2 operands.
4359 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
4360 bool Considerable = false;
4361 AllowPromotionWithoutCommonHeader = false;
4362 if (!isa<SExtInst>(&I))
4363 return false;
4364 Type *ConsideredSExtType =
4365 Type::getInt64Ty(I.getParent()->getParent()->getContext());
4366 if (I.getType() != ConsideredSExtType)
4367 return false;
4368 // See if the sext is the one with the right type and used in at least one
4369 // GetElementPtrInst.
4370 for (const User *U : I.users()) {
4371 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
4372 Considerable = true;
4373 // A getelementptr is considered as "complex" if it has more than 2
4374 // operands. We will promote a SExt used in such complex GEP as we
4375 // expect some computation to be merged if they are done on 64 bits.
4376 if (GEPInst->getNumOperands() > 2) {
4377 AllowPromotionWithoutCommonHeader = true;
4378 break;
4379 }
4380 }
4381 }
4382 return Considerable;
4383}
4384
4386 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
4387 if (!VF.isScalable())
4388 return true;
4389
4390 Type *Ty = RdxDesc.getRecurrenceType();
4392 return false;
4393
4394 switch (RdxDesc.getRecurrenceKind()) {
4395 case RecurKind::Add:
4396 case RecurKind::FAdd:
4397 case RecurKind::And:
4398 case RecurKind::Or:
4399 case RecurKind::Xor:
4400 case RecurKind::SMin:
4401 case RecurKind::SMax:
4402 case RecurKind::UMin:
4403 case RecurKind::UMax:
4404 case RecurKind::FMin:
4405 case RecurKind::FMax:
4406 case RecurKind::FMulAdd:
4407 case RecurKind::IAnyOf:
4408 case RecurKind::FAnyOf:
4409 return true;
4410 default:
4411 return false;
4412 }
4413}
4414
4417 FastMathFlags FMF,
4419 // The code-generator is currently not able to handle scalable vectors
4420 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4421 // it. This change will be removed when code-generation for these types is
4422 // sufficiently reliable.
4423 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4424 if (VTy->getElementCount() == ElementCount::getScalable(1))
4426
4427 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4428
4429 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
4430 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
4431
4432 InstructionCost LegalizationCost = 0;
4433 if (LT.first > 1) {
4434 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
4435 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
4436 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
4437 }
4438
4439 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
4440}
4441
4443 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
4444 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4445 InstructionCost LegalizationCost = 0;
4446 if (LT.first > 1) {
4447 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
4448 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
4449 LegalizationCost *= LT.first - 1;
4450 }
4451
4452 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4453 assert(ISD && "Invalid opcode");
4454 // Add the final reduction cost for the legal horizontal reduction
4455 switch (ISD) {
4456 case ISD::ADD:
4457 case ISD::AND:
4458 case ISD::OR:
4459 case ISD::XOR:
4460 case ISD::FADD:
4461 return LegalizationCost + 2;
4462 default:
4464 }
4465}
4466
4469 std::optional<FastMathFlags> FMF,
4471 // The code-generator is currently not able to handle scalable vectors
4472 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4473 // it. This change will be removed when code-generation for these types is
4474 // sufficiently reliable.
4475 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
4476 if (VTy->getElementCount() == ElementCount::getScalable(1))
4478
4480 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
4481 InstructionCost BaseCost =
4482 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4483 // Add on extra cost to reflect the extra overhead on some CPUs. We still
4484 // end up vectorizing for more computationally intensive loops.
4485 return BaseCost + FixedVTy->getNumElements();
4486 }
4487
4488 if (Opcode != Instruction::FAdd)
4490
4491 auto *VTy = cast<ScalableVectorType>(ValTy);
4493 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
4494 Cost *= getMaxNumElements(VTy->getElementCount());
4495 return Cost;
4496 }
4497
4498 if (isa<ScalableVectorType>(ValTy))
4499 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
4500
4501 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
4502 MVT MTy = LT.second;
4503 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4504 assert(ISD && "Invalid opcode");
4505
4506 // Horizontal adds can use the 'addv' instruction. We model the cost of these
4507 // instructions as twice a normal vector add, plus 1 for each legalization
4508 // step (LT.first). This is the only arithmetic vector reduction operation for
4509 // which we have an instruction.
4510 // OR, XOR and AND costs should match the codegen from:
4511 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
4512 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4513 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4514 static const CostTblEntry CostTblNoPairwise[]{
4515 {ISD::ADD, MVT::v8i8, 2},
4516 {ISD::ADD, MVT::v16i8, 2},
4517 {ISD::ADD, MVT::v4i16, 2},
4518 {ISD::ADD, MVT::v8i16, 2},
4519 {ISD::ADD, MVT::v4i32, 2},
4520 {ISD::ADD, MVT::v2i64, 2},
4521 {ISD::OR, MVT::v8i8, 15},
4522 {ISD::OR, MVT::v16i8, 17},
4523 {ISD::OR, MVT::v4i16, 7},
4524 {ISD::OR, MVT::v8i16, 9},
4525 {ISD::OR, MVT::v2i32, 3},
4526 {ISD::OR, MVT::v4i32, 5},
4527 {ISD::OR, MVT::v2i64, 3},
4528 {ISD::XOR, MVT::v8i8, 15},
4529 {ISD::XOR, MVT::v16i8, 17},
4530 {ISD::XOR, MVT::v4i16, 7},
4531 {ISD::XOR, MVT::v8i16, 9},
4532 {ISD::XOR, MVT::v2i32, 3},
4533 {ISD::XOR, MVT::v4i32, 5},
4534 {ISD::XOR, MVT::v2i64, 3},
4535 {ISD::AND, MVT::v8i8, 15},
4536 {ISD::AND, MVT::v16i8, 17},
4537 {ISD::AND, MVT::v4i16, 7},
4538 {ISD::AND, MVT::v8i16, 9},
4539 {ISD::AND, MVT::v2i32, 3},
4540 {ISD::AND, MVT::v4i32, 5},
4541 {ISD::AND, MVT::v2i64, 3},
4542 };
4543 switch (ISD) {
4544 default:
4545 break;
4546 case ISD::FADD:
4547 if (Type *EltTy = ValTy->getScalarType();
4548 // FIXME: For half types without fullfp16 support, this could extend and
4549 // use a fp32 faddp reduction but current codegen unrolls.
4550 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
4551 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
4552 const unsigned NElts = MTy.getVectorNumElements();
4553 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
4554 isPowerOf2_32(NElts))
4555 // Reduction corresponding to series of fadd instructions is lowered to
4556 // series of faddp instructions. faddp has latency/throughput that
4557 // matches fadd instruction and hence, every faddp instruction can be
4558 // considered to have a relative cost = 1 with
4559 // CostKind = TCK_RecipThroughput.
4560 // An faddp will pairwise add vector elements, so the size of input
4561 // vector reduces by half every time, requiring
4562 // #(faddp instructions) = log2_32(NElts).
4563 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
4564 }
4565 break;
4566 case ISD::ADD:
4567 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4568 return (LT.first - 1) + Entry->Cost;
4569 break;
4570 case ISD::XOR:
4571 case ISD::AND:
4572 case ISD::OR:
4573 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4574 if (!Entry)
4575 break;
4576 auto *ValVTy = cast<FixedVectorType>(ValTy);
4577 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4578 isPowerOf2_32(ValVTy->getNumElements())) {
4579 InstructionCost ExtraCost = 0;
4580 if (LT.first != 1) {
4581 // Type needs to be split, so there is an extra cost of LT.first - 1
4582 // arithmetic ops.
4583 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4584 MTy.getVectorNumElements());
4585 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4586 ExtraCost *= LT.first - 1;
4587 }
4588 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4589 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4590 return Cost + ExtraCost;
4591 }
4592 break;
4593 }
4594 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4595}
4596
4598 static const CostTblEntry ShuffleTbl[] = {
4599 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4600 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4601 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4602 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4603 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4604 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4605 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4606 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4607 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4608 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4609 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4610 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4611 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4612 };
4613
4614 // The code-generator is currently not able to handle scalable vectors
4615 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4616 // it. This change will be removed when code-generation for these types is
4617 // sufficiently reliable.
4620
4621 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4622 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4624 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4625 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4626 : LT.second;
4627 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4628 InstructionCost LegalizationCost = 0;
4629 if (Index < 0) {
4630 LegalizationCost =
4631 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4633 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4635 }
4636
4637 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
4638 // Cost performed on a promoted type.
4639 if (LT.second.getScalarType() == MVT::i1) {
4640 LegalizationCost +=
4641 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4643 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4645 }
4646 const auto *Entry =
4647 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4648 assert(Entry && "Illegal Type for Splice");
4649 LegalizationCost += Entry->Cost;
4650 return LegalizationCost * LT.first;
4651}
4652
4655 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
4656 ArrayRef<const Value *> Args, const Instruction *CxtI) {
4657 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4658
4659 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4660 // into smaller vectors and sum the cost of each shuffle.
4661 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4662 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4663 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4664
4665 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4666 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4667 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4668 // cost than just the load.
4669 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4672 return std::max<InstructionCost>(1, LT.first / 4);
4673
4674 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4675 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4676 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4677 // cost than just the store.
4678 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4680 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4682 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4683 return LT.first;
4684
4685 unsigned TpNumElts = Mask.size();
4686 unsigned LTNumElts = LT.second.getVectorNumElements();
4687 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4688 VectorType *NTp =
4689 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4691 for (unsigned N = 0; N < NumVecs; N++) {
4692 SmallVector<int> NMask;
4693 // Split the existing mask into chunks of size LTNumElts. Track the source
4694 // sub-vectors to ensure the result has at most 2 inputs.
4695 unsigned Source1, Source2;
4696 unsigned NumSources = 0;
4697 for (unsigned E = 0; E < LTNumElts; E++) {
4698 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4700 if (MaskElt < 0) {
4702 continue;
4703 }
4704
4705 // Calculate which source from the input this comes from and whether it
4706 // is new to us.
4707 unsigned Source = MaskElt / LTNumElts;
4708 if (NumSources == 0) {
4709 Source1 = Source;
4710 NumSources = 1;
4711 } else if (NumSources == 1 && Source != Source1) {
4712 Source2 = Source;
4713 NumSources = 2;
4714 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4715 NumSources++;
4716 }
4717
4718 // Add to the new mask. For the NumSources>2 case these are not correct,
4719 // but are only used for the modular lane number.
4720 if (Source == Source1)
4721 NMask.push_back(MaskElt % LTNumElts);
4722 else if (Source == Source2)
4723 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4724 else
4725 NMask.push_back(MaskElt % LTNumElts);
4726 }
4727 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4728 // getShuffleCost. If not then cost it using the worst case as the number
4729 // of element moves into a new vector.
4730 if (NumSources <= 2)
4731 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4733 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4734 else
4735 Cost += LTNumElts;
4736 }
4737 return Cost;
4738 }
4739
4740 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4741 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4742 // A subvector extract can be implemented with an ext (or trivial extract, if
4743 // from lane 0). This currently only handles low or high extracts to prevent
4744 // SLP vectorizer regressions.
4745 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
4746 if (LT.second.is128BitVector() &&
4747 cast<FixedVectorType>(SubTp)->getNumElements() ==
4748 LT.second.getVectorNumElements() / 2) {
4749 if (Index == 0)
4750 return 0;
4751 if (Index == (int)LT.second.getVectorNumElements() / 2)
4752 return 1;
4753 }
4755 }
4756
4757 // Check for broadcast loads, which are supported by the LD1R instruction.
4758 // In terms of code-size, the shuffle vector is free when a load + dup get
4759 // folded into a LD1R. That's what we check and return here. For performance
4760 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4761 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4762 // that we model the load + dup sequence slightly higher because LD1R is a
4763 // high latency instruction.
4764 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4765 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4766 if (IsLoad && LT.second.isVector() &&
4768 LT.second.getVectorElementCount()))
4769 return 0;
4770 }
4771
4772 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4773 // from the perfect shuffle tables.
4774 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4775 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4776 all_of(Mask, [](int E) { return E < 8; }))
4777 return getPerfectShuffleCost(Mask);
4778
4779 // Check for identity masks, which we can treat as free.
4780 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4781 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4782 all_of(enumerate(Mask), [](const auto &M) {
4783 return M.value() < 0 || M.value() == (int)M.index();
4784 }))
4785 return 0;
4786
4787 // Check for other shuffles that are not SK_ kinds but we have native
4788 // instructions for, for example ZIP and UZP.
4789 unsigned Unused;
4790 if (LT.second.isFixedLengthVector() &&
4791 LT.second.getVectorNumElements() == Mask.size() &&
4792 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4793 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4794 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4795 // Check for non-zero lane splats
4796 all_of(drop_begin(Mask),
4797 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4798 return 1;
4799
4800 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4801 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4802 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4803 static const CostTblEntry ShuffleTbl[] = {
4804 // Broadcast shuffle kinds can be performed with 'dup'.
4805 {TTI::SK_Broadcast, MVT::v8i8, 1},
4806 {TTI::SK_Broadcast, MVT::v16i8, 1},
4807 {TTI::SK_Broadcast, MVT::v4i16, 1},
4808 {TTI::SK_Broadcast, MVT::v8i16, 1},
4809 {TTI::SK_Broadcast, MVT::v2i32, 1},
4810 {TTI::SK_Broadcast, MVT::v4i32, 1},
4811 {TTI::SK_Broadcast, MVT::v2i64, 1},
4812 {TTI::SK_Broadcast, MVT::v4f16, 1},
4813 {TTI::SK_Broadcast, MVT::v8f16, 1},
4814 {TTI::SK_Broadcast, MVT::v2f32, 1},
4815 {TTI::SK_Broadcast, MVT::v4f32, 1},
4816 {TTI::SK_Broadcast, MVT::v2f64, 1},
4817 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4818 // 'zip1/zip2' instructions.
4819 {TTI::SK_Transpose, MVT::v8i8, 1},
4820 {TTI::SK_Transpose, MVT::v16i8, 1},
4821 {TTI::SK_Transpose, MVT::v4i16, 1},
4822 {TTI::SK_Transpose, MVT::v8i16, 1},
4823 {TTI::SK_Transpose, MVT::v2i32, 1},
4824 {TTI::SK_Transpose, MVT::v4i32, 1},
4825 {TTI::SK_Transpose, MVT::v2i64, 1},
4826 {TTI::SK_Transpose, MVT::v4f16, 1},
4827 {TTI::SK_Transpose, MVT::v8f16, 1},
4828 {TTI::SK_Transpose, MVT::v2f32, 1},
4829 {TTI::SK_Transpose, MVT::v4f32, 1},
4830 {TTI::SK_Transpose, MVT::v2f64, 1},
4831 // Select shuffle kinds.
4832 // TODO: handle vXi8/vXi16.
4833 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4834 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4835 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4836 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4837 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4838 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4839 // PermuteSingleSrc shuffle kinds.
4840 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4841 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4842 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4843 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4844 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4845 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4846 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4847 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4848 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4849 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4850 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4851 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4852 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4853 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4854 // Reverse can be lowered with `rev`.
4855 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4856 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4857 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4858 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4859 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4860 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4861 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4862 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4863 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4864 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4865 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4866 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4867 // Splice can all be lowered as `ext`.
4868 {TTI::SK_Splice, MVT::v2i32, 1},
4869 {TTI::SK_Splice, MVT::v4i32, 1},
4870 {TTI::SK_Splice, MVT::v2i64, 1},
4871 {TTI::SK_Splice, MVT::v2f32, 1},
4872 {TTI::SK_Splice, MVT::v4f32, 1},
4873 {TTI::SK_Splice, MVT::v2f64, 1},
4874 {TTI::SK_Splice, MVT::v8f16, 1},
4875 {TTI::SK_Splice, MVT::v8bf16, 1},
4876 {TTI::SK_Splice, MVT::v8i16, 1},
4877 {TTI::SK_Splice, MVT::v16i8, 1},
4878 {TTI::SK_Splice, MVT::v4bf16, 1},
4879 {TTI::SK_Splice, MVT::v4f16, 1},
4880 {TTI::SK_Splice, MVT::v4i16, 1},
4881 {TTI::SK_Splice, MVT::v8i8, 1},
4882 // Broadcast shuffle kinds for scalable vectors
4883 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4884 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4885 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4886 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4887 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4888 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4889 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4890 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4891 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4892 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4893 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4894 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4895 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4896 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4897 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4898 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4899 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4900 // Handle the cases for vector.reverse with scalable vectors
4901 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4902 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4903 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4904 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4905 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4906 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4907 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4908 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4909 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4910 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4911 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4912 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4913 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4914 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4915 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4916 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4917 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4918 };
4919 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4920 return LT.first * Entry->Cost;
4921 }
4922
4923 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4924 return getSpliceCost(Tp, Index);
4925
4926 // Inserting a subvector can often be done with either a D, S or H register
4927 // move, so long as the inserted vector is "aligned".
4928 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4929 LT.second.getSizeInBits() <= 128 && SubTp) {
4930 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4931 if (SubLT.second.isVector()) {
4932 int NumElts = LT.second.getVectorNumElements();
4933 int NumSubElts = SubLT.second.getVectorNumElements();
4934 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4935 return SubLT.first;
4936 }
4937 }
4938
4939 // Restore optimal kind.
4940 if (IsExtractSubvector)
4942 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4943 CxtI);
4944}
4945
4948 const auto &Strides = DenseMap<Value *, const SCEV *>();
4949 for (BasicBlock *BB : TheLoop->blocks()) {
4950 // Scan the instructions in the block and look for addresses that are
4951 // consecutive and decreasing.
4952 for (Instruction &I : *BB) {
4953 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4955 Type *AccessTy = getLoadStoreType(&I);
4956 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4957 /*ShouldCheckWrap=*/false)
4958 .value_or(0) < 0)
4959 return true;
4960 }
4961 }
4962 }
4963 return false;
4964}
4965
4969 return ST->useFixedOverScalableIfEqualCost();
4970}
4971
4973 return ST->getEpilogueVectorizationMinVF();
4974}
4975
4977 if (!ST->hasSVE())
4978 return false;
4979
4980 // We don't currently support vectorisation with interleaving for SVE - with
4981 // such loops we're better off not using tail-folding. This gives us a chance
4982 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4983 if (TFI->IAI->hasGroups())
4984 return false;
4985
4987 if (TFI->LVL->getReductionVars().size())
4988 Required |= TailFoldingOpts::Reductions;
4989 if (TFI->LVL->getFixedOrderRecurrences().size())
4990 Required |= TailFoldingOpts::Recurrences;
4991
4992 // We call this to discover whether any load/store pointers in the loop have
4993 // negative strides. This will require extra work to reverse the loop
4994 // predicate, which may be expensive.
4997 Required |= TailFoldingOpts::Reverse;
4998 if (Required == TailFoldingOpts::Disabled)
4999 Required |= TailFoldingOpts::Simple;
5000
5002 Required))
5003 return false;
5004
5005 // Don't tail-fold for tight loops where we would be better off interleaving
5006 // with an unpredicated loop.
5007 unsigned NumInsns = 0;
5008 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5009 NumInsns += BB->sizeWithoutDebug();
5010 }
5011
5012 // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
5013 return NumInsns >= SVETailFoldInsnThreshold;
5014}
5015
5018 StackOffset BaseOffset, bool HasBaseReg,
5019 int64_t Scale, unsigned AddrSpace) const {
5020 // Scaling factors are not free at all.
5021 // Operands | Rt Latency
5022 // -------------------------------------------
5023 // Rt, [Xn, Xm] | 4
5024 // -------------------------------------------
5025 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
5026 // Rt, [Xn, Wm, <extend> #imm] |
5028 AM.BaseGV = BaseGV;
5029 AM.BaseOffs = BaseOffset.getFixed();
5030 AM.HasBaseReg = HasBaseReg;
5031 AM.Scale = Scale;
5032 AM.ScalableOffset = BaseOffset.getScalable();
5033 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
5034 // Scale represents reg2 * scale, thus account for 1 if
5035 // it is not equal to 0 or 1.
5036 return AM.Scale != 0 && AM.Scale != 1;
5037 return -1;
5038}
5039
5042 // For the binary operators (e.g. or) we need to be more careful than
5043 // selects, here we only transform them if they are already at a natural
5044 // break point in the code - the end of a block with an unconditional
5045 // terminator.
5046 if (I->getOpcode() == Instruction::Or &&
5047 isa<BranchInst>(I->getNextNode()) &&
5048 cast<BranchInst>(I->getNextNode())->isUnconditional())
5049 return true;
5050
5051 if (I->getOpcode() == Instruction::Add ||
5052 I->getOpcode() == Instruction::Sub)
5053 return true;
5054 }
5056}
5057
5059 const TargetTransformInfo::LSRCost &C2) {
5060 // AArch64 specific here is adding the number of instructions to the
5061 // comparison (though not as the first consideration, as some targets do)
5062 // along with changing the priority of the base additions.
5063 // TODO: Maybe a more nuanced tradeoff between instruction count
5064 // and number of registers? To be investigated at a later date.
5065 if (EnableLSRCostOpt)
5066 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
5067 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
5068 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
5069 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
5070
5072}
5073
5074static bool isSplatShuffle(Value *V) {
5075 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
5076 return all_equal(Shuf->getShuffleMask());
5077 return false;
5078}
5079
5080/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5081/// or upper half of the vector elements.
5082static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5083 bool AllowSplat = false) {
5084 // Scalable types can't be extract shuffle vectors.
5085 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5086 return false;
5087
5088 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5089 auto *FullTy = FullV->getType();
5090 auto *HalfTy = HalfV->getType();
5091 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5092 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5093 };
5094
5095 auto extractHalf = [](Value *FullV, Value *HalfV) {
5096 auto *FullVT = cast<FixedVectorType>(FullV->getType());
5097 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
5098 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5099 };
5100
5101 ArrayRef<int> M1, M2;
5102 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5103 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
5104 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
5105 return false;
5106
5107 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relavant arg so that
5108 // it is not checked as an extract below.
5109 if (AllowSplat && isSplatShuffle(Op1))
5110 S1Op1 = nullptr;
5111 if (AllowSplat && isSplatShuffle(Op2))
5112 S2Op1 = nullptr;
5113
5114 // Check that the operands are half as wide as the result and we extract
5115 // half of the elements of the input vectors.
5116 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
5117 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
5118 return false;
5119
5120 // Check the mask extracts either the lower or upper half of vector
5121 // elements.
5122 int M1Start = 0;
5123 int M2Start = 0;
5124 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
5125 if ((S1Op1 &&
5126 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
5127 (S2Op1 &&
5128 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
5129 return false;
5130
5131 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
5132 (M2Start != 0 && M2Start != (NumElements / 2)))
5133 return false;
5134 if (S1Op1 && S2Op1 && M1Start != M2Start)
5135 return false;
5136
5137 return true;
5138}
5139
5140/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
5141/// of the vector elements.
5142static bool areExtractExts(Value *Ext1, Value *Ext2) {
5143 auto areExtDoubled = [](Instruction *Ext) {
5144 return Ext->getType()->getScalarSizeInBits() ==
5145 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
5146 };
5147
5148 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
5149 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
5150 !areExtDoubled(cast<Instruction>(Ext1)) ||
5151 !areExtDoubled(cast<Instruction>(Ext2)))
5152 return false;
5153
5154 return true;
5155}
5156
5157/// Check if Op could be used with vmull_high_p64 intrinsic.
5159 Value *VectorOperand = nullptr;
5160 ConstantInt *ElementIndex = nullptr;
5161 return match(Op, m_ExtractElt(m_Value(VectorOperand),
5162 m_ConstantInt(ElementIndex))) &&
5163 ElementIndex->getValue() == 1 &&
5164 isa<FixedVectorType>(VectorOperand->getType()) &&
5165 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
5166}
5167
5168/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
5169static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
5171}
5172
5174 // Restrict ourselves to the form CodeGenPrepare typically constructs.
5175 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
5176 if (!GEP || GEP->getNumOperands() != 2)
5177 return false;
5178
5179 Value *Base = GEP->getOperand(0);
5180 Value *Offsets = GEP->getOperand(1);
5181
5182 // We only care about scalar_base+vector_offsets.
5183 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
5184 return false;
5185
5186 // Sink extends that would allow us to use 32-bit offset vectors.
5187 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
5188 auto *OffsetsInst = cast<Instruction>(Offsets);
5189 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
5190 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
5191 Ops.push_back(&GEP->getOperandUse(1));
5192 }
5193
5194 // Sink the GEP.
5195 return true;
5196}
5197
5198/// We want to sink following cases:
5199/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
5200/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
5202 if (match(Op, m_VScale()))
5203 return true;
5204 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
5206 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5207 return true;
5208 }
5209 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
5211 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
5212 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
5213 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
5214 return true;
5215 }
5216 return false;
5217}
5218
5219/// Check if sinking \p I's operands to I's basic block is profitable, because
5220/// the operands can be folded into a target instruction, e.g.
5221/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
5223 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5224 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
5225 switch (II->getIntrinsicID()) {
5226 case Intrinsic::aarch64_neon_smull:
5227 case Intrinsic::aarch64_neon_umull:
5228 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
5229 /*AllowSplat=*/true)) {
5230 Ops.push_back(&II->getOperandUse(0));
5231 Ops.push_back(&II->getOperandUse(1));
5232 return true;
5233 }
5234 [[fallthrough]];
5235
5236 case Intrinsic::fma:
5237 case Intrinsic::fmuladd:
5238 if (isa<VectorType>(I->getType()) &&
5239 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5240 !ST->hasFullFP16())
5241 return false;
5242 [[fallthrough]];
5243 case Intrinsic::aarch64_neon_sqdmull:
5244 case Intrinsic::aarch64_neon_sqdmulh:
5245 case Intrinsic::aarch64_neon_sqrdmulh:
5246 // Sink splats for index lane variants
5247 if (isSplatShuffle(II->getOperand(0)))
5248 Ops.push_back(&II->getOperandUse(0));
5249 if (isSplatShuffle(II->getOperand(1)))
5250 Ops.push_back(&II->getOperandUse(1));
5251 return !Ops.empty();
5252 case Intrinsic::aarch64_neon_fmlal:
5253 case Intrinsic::aarch64_neon_fmlal2:
5254 case Intrinsic::aarch64_neon_fmlsl:
5255 case Intrinsic::aarch64_neon_fmlsl2:
5256 // Sink splats for index lane variants
5257 if (isSplatShuffle(II->getOperand(1)))
5258 Ops.push_back(&II->getOperandUse(1));
5259 if (isSplatShuffle(II->getOperand(2)))
5260 Ops.push_back(&II->getOperandUse(2));
5261 return !Ops.empty();
5262 case Intrinsic::aarch64_sve_ptest_first:
5263 case Intrinsic::aarch64_sve_ptest_last:
5264 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
5265 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
5266 Ops.push_back(&II->getOperandUse(0));
5267 return !Ops.empty();
5268 case Intrinsic::aarch64_sme_write_horiz:
5269 case Intrinsic::aarch64_sme_write_vert:
5270 case Intrinsic::aarch64_sme_writeq_horiz:
5271 case Intrinsic::aarch64_sme_writeq_vert: {
5272 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
5273 if (!Idx || Idx->getOpcode() != Instruction::Add)
5274 return false;
5275 Ops.push_back(&II->getOperandUse(1));
5276 return true;
5277 }
5278 case Intrinsic::aarch64_sme_read_horiz:
5279 case Intrinsic::aarch64_sme_read_vert:
5280 case Intrinsic::aarch64_sme_readq_horiz:
5281 case Intrinsic::aarch64_sme_readq_vert:
5282 case Intrinsic::aarch64_sme_ld1b_vert:
5283 case Intrinsic::aarch64_sme_ld1h_vert:
5284 case Intrinsic::aarch64_sme_ld1w_vert:
5285 case Intrinsic::aarch64_sme_ld1d_vert:
5286 case Intrinsic::aarch64_sme_ld1q_vert:
5287 case Intrinsic::aarch64_sme_st1b_vert:
5288 case Intrinsic::aarch64_sme_st1h_vert:
5289 case Intrinsic::aarch64_sme_st1w_vert:
5290 case Intrinsic::aarch64_sme_st1d_vert:
5291 case Intrinsic::aarch64_sme_st1q_vert:
5292 case Intrinsic::aarch64_sme_ld1b_horiz:
5293 case Intrinsic::aarch64_sme_ld1h_horiz:
5294 case Intrinsic::aarch64_sme_ld1w_horiz:
5295 case Intrinsic::aarch64_sme_ld1d_horiz:
5296 case Intrinsic::aarch64_sme_ld1q_horiz:
5297 case Intrinsic::aarch64_sme_st1b_horiz:
5298 case Intrinsic::aarch64_sme_st1h_horiz:
5299 case Intrinsic::aarch64_sme_st1w_horiz:
5300 case Intrinsic::aarch64_sme_st1d_horiz:
5301 case Intrinsic::aarch64_sme_st1q_horiz: {
5302 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
5303 if (!Idx || Idx->getOpcode() != Instruction::Add)
5304 return false;
5305 Ops.push_back(&II->getOperandUse(3));
5306 return true;
5307 }
5308 case Intrinsic::aarch64_neon_pmull:
5309 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
5310 return false;
5311 Ops.push_back(&II->getOperandUse(0));
5312 Ops.push_back(&II->getOperandUse(1));
5313 return true;
5314 case Intrinsic::aarch64_neon_pmull64:
5315 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
5316 II->getArgOperand(1)))
5317 return false;
5318 Ops.push_back(&II->getArgOperandUse(0));
5319 Ops.push_back(&II->getArgOperandUse(1));
5320 return true;
5321 case Intrinsic::masked_gather:
5322 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
5323 return false;
5324 Ops.push_back(&II->getArgOperandUse(0));
5325 return true;
5326 case Intrinsic::masked_scatter:
5327 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
5328 return false;
5329 Ops.push_back(&II->getArgOperandUse(1));
5330 return true;
5331 default:
5332 return false;
5333 }
5334 }
5335
5336 auto ShouldSinkCondition = [](Value *Cond) -> bool {
5337 auto *II = dyn_cast<IntrinsicInst>(Cond);
5338 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
5339 isa<ScalableVectorType>(II->getOperand(0)->getType());
5340 };
5341
5342 switch (I->getOpcode()) {
5343 case Instruction::GetElementPtr:
5344 case Instruction::Add:
5345 case Instruction::Sub:
5346 // Sink vscales closer to uses for better isel
5347 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
5348 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
5349 Ops.push_back(&I->getOperandUse(Op));
5350 return true;
5351 }
5352 }
5353 break;
5354 case Instruction::Select: {
5355 if (!ShouldSinkCondition(I->getOperand(0)))
5356 return false;
5357
5358 Ops.push_back(&I->getOperandUse(0));
5359 return true;
5360 }
5361 case Instruction::Br: {
5362 if (cast<BranchInst>(I)->isUnconditional())
5363 return false;
5364
5365 if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
5366 return false;
5367
5368 Ops.push_back(&I->getOperandUse(0));
5369 return true;
5370 }
5371 default:
5372 break;
5373 }
5374
5375 if (!I->getType()->isVectorTy())
5376 return false;
5377
5378 switch (I->getOpcode()) {
5379 case Instruction::Sub:
5380 case Instruction::Add: {
5381 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
5382 return false;
5383
5384 // If the exts' operands extract either the lower or upper elements, we
5385 // can sink them too.
5386 auto Ext1 = cast<Instruction>(I->getOperand(0));
5387 auto Ext2 = cast<Instruction>(I->getOperand(1));
5388 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
5389 Ops.push_back(&Ext1->getOperandUse(0));
5390 Ops.push_back(&Ext2->getOperandUse(0));
5391 }
5392
5393 Ops.push_back(&I->getOperandUse(0));
5394 Ops.push_back(&I->getOperandUse(1));
5395
5396 return true;
5397 }
5398 case Instruction::Or: {
5399 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
5400 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
5401 if (ST->hasNEON()) {
5402 Instruction *OtherAnd, *IA, *IB;
5403 Value *MaskValue;
5404 // MainAnd refers to And instruction that has 'Not' as one of its operands
5405 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
5406 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
5407 m_Instruction(IA)))))) {
5408 if (match(OtherAnd,
5409 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
5410 Instruction *MainAnd = I->getOperand(0) == OtherAnd
5411 ? cast<Instruction>(I->getOperand(1))
5412 : cast<Instruction>(I->getOperand(0));
5413
5414 // Both Ands should be in same basic block as Or
5415 if (I->getParent() != MainAnd->getParent() ||
5416 I->getParent() != OtherAnd->getParent())
5417 return false;
5418
5419 // Non-mask operands of both Ands should also be in same basic block
5420 if (I->getParent() != IA->getParent() ||
5421 I->getParent() != IB->getParent())
5422 return false;
5423
5424 Ops.push_back(
5425 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
5426 Ops.push_back(&I->getOperandUse(0));
5427 Ops.push_back(&I->getOperandUse(1));
5428
5429 return true;
5430 }
5431 }
5432 }
5433
5434 return false;
5435 }
5436 case Instruction::Mul: {
5437 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
5438 auto *Ty = cast<VectorType>(V->getType());
5439 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5440 if (Ty->isScalableTy())
5441 return false;
5442
5443 // Indexed variants of Mul exist for i16 and i32 element types only.
5444 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
5445 };
5446
5447 int NumZExts = 0, NumSExts = 0;
5448 for (auto &Op : I->operands()) {
5449 // Make sure we are not already sinking this operand
5450 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
5451 continue;
5452
5453 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
5454 auto *Ext = cast<Instruction>(Op);
5455 auto *ExtOp = Ext->getOperand(0);
5456 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
5457 Ops.push_back(&Ext->getOperandUse(0));
5458 Ops.push_back(&Op);
5459
5460 if (isa<SExtInst>(Ext))
5461 NumSExts++;
5462 else
5463 NumZExts++;
5464
5465 continue;
5466 }
5467
5468 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
5469 if (!Shuffle)
5470 continue;
5471
5472 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
5473 // operand and the s/zext can help create indexed s/umull. This is
5474 // especially useful to prevent i64 mul being scalarized.
5475 if (isSplatShuffle(Shuffle) &&
5476 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
5477 Ops.push_back(&Shuffle->getOperandUse(0));
5478 Ops.push_back(&Op);
5479 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
5480 NumSExts++;
5481 else
5482 NumZExts++;
5483 continue;
5484 }
5485
5486 Value *ShuffleOperand = Shuffle->getOperand(0);
5487 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
5488 if (!Insert)
5489 continue;
5490
5491 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
5492 if (!OperandInstr)
5493 continue;
5494
5495 ConstantInt *ElementConstant =
5496 dyn_cast<ConstantInt>(Insert->getOperand(2));
5497 // Check that the insertelement is inserting into element 0
5498 if (!ElementConstant || !ElementConstant->isZero())
5499 continue;
5500
5501 unsigned Opcode = OperandInstr->getOpcode();
5502 if (Opcode == Instruction::SExt)
5503 NumSExts++;
5504 else if (Opcode == Instruction::ZExt)
5505 NumZExts++;
5506 else {
5507 // If we find that the top bits are known 0, then we can sink and allow
5508 // the backend to generate a umull.
5509 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
5510 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
5511 const DataLayout &DL = I->getDataLayout();
5512 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
5513 continue;
5514 NumZExts++;
5515 }
5516
5517 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
5518 // the And, just to hoist it again back to the load.
5519 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
5520 Ops.push_back(&Insert->getOperandUse(1));
5521 Ops.push_back(&Shuffle->getOperandUse(0));
5522 Ops.push_back(&Op);
5523 }
5524
5525 // It is profitable to sink if we found two of the same type of extends.
5526 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
5527 return true;
5528
5529 // Otherwise, see if we should sink splats for indexed variants.
5530 if (!ShouldSinkSplatForIndexedVariant(I))
5531 return false;
5532
5533 Ops.clear();
5534 if (isSplatShuffle(I->getOperand(0)))
5535 Ops.push_back(&I->getOperandUse(0));
5536 if (isSplatShuffle(I->getOperand(1)))
5537 Ops.push_back(&I->getOperandUse(1));
5538
5539 return !Ops.empty();
5540 }
5541 case Instruction::FMul: {
5542 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
5543 if (I->getType()->isScalableTy())
5544 return false;
5545
5546 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5547 !ST->hasFullFP16())
5548 return false;
5549
5550 // Sink splats for index lane variants
5551 if (isSplatShuffle(I->getOperand(0)))
5552 Ops.push_back(&I->getOperandUse(0));
5553 if (isSplatShuffle(I->getOperand(1)))
5554 Ops.push_back(&I->getOperandUse(1));
5555 return !Ops.empty();
5556 }
5557 default:
5558 return false;
5559 }
5560 return false;
5561}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, int PredPos)
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use if the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II, bool hasInactiveVector)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
uint64_t Size
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getEpilogueVectorizationMinVF() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
unsigned getEpilogueVectorizationMinVF() const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1649
unsigned countLeadingOnes() const
Definition: APInt.h:1603
void negate()
Negate this APInt in place.
Definition: APInt.h:1450
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
unsigned logBase2() const
Definition: APInt.h:1739
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:827
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:623
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:307
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:695
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:923
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:807
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:959
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:380
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:218
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:676
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:678
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:680
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
bool isIntPredicate() const
Definition: InstrTypes.h:781
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1672
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:208
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition: IRBuilder.h:92
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:89
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2510
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2561
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1079
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2498
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:578
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1153
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:546
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:563
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1873
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:890
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2233
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2434
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1676
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1797
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2532
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1810
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:566
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:573
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2224
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2704
This instruction inserts a single (scalar) element into a VectorType value.
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:48
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:388
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:412
BuilderTy & Builder
Definition: InstCombiner.h:61
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:694
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:812
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:651
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
This instruction constructs a fixed permutation of two input vectors.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
Class to represent struct types.
Definition: DerivedTypes.h:218
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:241
Value * getOperand(unsigned i) const
Definition: User.h:228
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
int getNumOccurrences() const
Definition: CommandLine.h:399
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:931
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1065
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition: VE.h:376
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:2087
InstructionCost Cost
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:382
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:279
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isFixedLengthVector() const
Definition: ValueTypes.h:181
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:174
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55