//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

69namespace {
70class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (disabled|default|simple|all)[+(Flag1|Flag2|etc)], where here
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
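  // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to
  // TailFoldingOpts::All and adds Reverse to DisableBits, while
  // "-sve-tail-folding=default+reductions" leaves NeedsDefault set and adds
  // Reductions to EnableBits.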
79 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
80 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
81 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
82
83 // This value needs to be initialised to true in case the user does not
84 // explicitly set the -sve-tail-folding option.
85 bool NeedsDefault = true;
86
87 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
88
89 void setNeedsDefault(bool V) { NeedsDefault = V; }
90
91 void setEnableBit(TailFoldingOpts Bit) {
92 EnableBits |= Bit;
93 DisableBits &= ~Bit;
94 }
95
96 void setDisableBit(TailFoldingOpts Bit) {
97 EnableBits &= ~Bit;
98 DisableBits |= Bit;
99 }
100
101 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
102 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
103
104 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
105 "Initial bits should only include one of "
106 "(disabled|all|simple|default)");
107 Bits = NeedsDefault ? DefaultBits : InitialBits;
108 Bits |= EnableBits;
109 Bits &= ~DisableBits;
110
111 return Bits;
112 }
113
114 void reportError(std::string Opt) {
115 errs() << "invalid argument '" << Opt
116 << "' to -sve-tail-folding=; the option should be of the form\n"
117 " (disabled|all|default|simple)[+(reductions|recurrences"
118 "|reverse|noreductions|norecurrences|noreverse)]\n";
119 report_fatal_error("Unrecognised tail-folding option");
120 }
121
122public:
123
124 void operator=(const std::string &Val) {
125 // If the user explicitly sets -sve-tail-folding= then treat as an error.
126 if (Val.empty()) {
127 reportError("");
128 return;
129 }
130
131 // Since the user is explicitly setting the option we don't automatically
132 // need the default unless they require it.
133 setNeedsDefault(false);
134
135 SmallVector<StringRef, 4> TailFoldTypes;
136 StringRef(Val).split(TailFoldTypes, '+', -1, false);
137
138 unsigned StartIdx = 1;
139 if (TailFoldTypes[0] == "disabled")
140 setInitialBits(TailFoldingOpts::Disabled);
141 else if (TailFoldTypes[0] == "all")
142 setInitialBits(TailFoldingOpts::All);
143 else if (TailFoldTypes[0] == "default")
144 setNeedsDefault(true);
145 else if (TailFoldTypes[0] == "simple")
146 setInitialBits(TailFoldingOpts::Simple);
147 else {
148 StartIdx = 0;
149 setInitialBits(TailFoldingOpts::Disabled);
150 }
151
152 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
153 if (TailFoldTypes[I] == "reductions")
154 setEnableBit(TailFoldingOpts::Reductions);
155 else if (TailFoldTypes[I] == "recurrences")
156 setEnableBit(TailFoldingOpts::Recurrences);
157 else if (TailFoldTypes[I] == "reverse")
158 setEnableBit(TailFoldingOpts::Reverse);
159 else if (TailFoldTypes[I] == "noreductions")
160 setDisableBit(TailFoldingOpts::Reductions);
161 else if (TailFoldTypes[I] == "norecurrences")
162 setDisableBit(TailFoldingOpts::Recurrences);
163 else if (TailFoldTypes[I] == "noreverse")
164 setDisableBit(TailFoldingOpts::Reverse);
165 else
166 reportError(Val);
167 }
168 }
169
170 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
171 return (getBits(DefaultBits) & Required) == Required;
172 }
173};
174} // namespace
175
TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}
241
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
  return (CallerBits & CalleeBits) == CalleeBits;
}
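
// Illustrative example of the feature-bit check above: a caller built with
// "+neon,+sve" may inline a callee that only requires "+neon" (the callee's
// bits are a subset), but not a callee that requires "+sve2".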
274
bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}
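
// Example for the check above: <8 x float> occupies 8 * 32 = 256 bits, so a
// pointer argument to it (or to an aggregate containing it) blocks promotion;
// a 128-bit type such as <4 x float> remains ABI-compatible.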
298
unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //     call from F -> G (the call here is Call)
  //
  // For (1), Call.getCaller() == F, so it will always return a high cost if
  // a streaming-mode change is required (thus promoting the need to inline the
  // function)
  //
  // (2) F:
  //     call from F -> G (the call here is not Call)
  //  G:
  //     call from G -> H (the call here is Call)
  //
  // For (2), if after inlining the body of G into F the call to H requires a
  // streaming-mode change, and the call to G from F would also require a
  // streaming-mode change, then there is benefit to do the streaming-mode
  // change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}
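
// With the default option values this scales DefaultCallPenalty by
// CallPenaltyChangeSM (5) in case (1) and by InlineCallPenaltyChangeSM (10) in
// case (2), i.e. a call that needs a PSTATE.SM change in its own caller is
// treated as 5x as expensive, and 10x when inlining would duplicate the
// streaming-mode change.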

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}
339
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}
356
/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}
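
// Worked example: a 64-bit constant such as 0x1234567890ABCDEF typically
// expands to one MOVZ plus three MOVKs, giving a cost of 4, whereas a bitmask
// immediate like 0x00FF00FF00FF00FF encodes directly into the consuming
// instruction and is costed as the minimum of 1 above.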
382
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
387 assert(Ty->isIntegerTy());
388
389 unsigned BitSize = Ty->getPrimitiveSizeInBits();
390 // There is no cost model for constants with a bit size of 0. Return TCC_Free
391 // here, so that constant hoisting will ignore this constant.
392 if (BitSize == 0)
393 return TTI::TCC_Free;
394
395 unsigned ImmIdx = ~0U;
396 switch (Opcode) {
397 default:
398 return TTI::TCC_Free;
399 case Instruction::GetElementPtr:
400 // Always hoist the base address of a GetElementPtr.
401 if (Idx == 0)
402 return 2 * TTI::TCC_Basic;
403 return TTI::TCC_Free;
404 case Instruction::Store:
405 ImmIdx = 0;
406 break;
407 case Instruction::Add:
408 case Instruction::Sub:
409 case Instruction::Mul:
410 case Instruction::UDiv:
411 case Instruction::SDiv:
412 case Instruction::URem:
413 case Instruction::SRem:
414 case Instruction::And:
415 case Instruction::Or:
416 case Instruction::Xor:
417 case Instruction::ICmp:
418 ImmIdx = 1;
419 break;
420 // Always return TCC_Free for the shift value of a shift instruction.
421 case Instruction::Shl:
422 case Instruction::LShr:
423 case Instruction::AShr:
424 if (Idx == 1)
425 return TTI::TCC_Free;
426 break;
427 case Instruction::Trunc:
428 case Instruction::ZExt:
429 case Instruction::SExt:
430 case Instruction::IntToPtr:
431 case Instruction::PtrToInt:
432 case Instruction::BitCast:
433 case Instruction::PHI:
434 case Instruction::Call:
435 case Instruction::Select:
436 case Instruction::Ret:
437 case Instruction::Load:
438 break;
439 }
440
  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
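
// Example of the hoisting decision above: for "add i64 %x, 4096" the
// materialization cost (1) does not exceed NumConstants * TCC_Basic, so
// TCC_Free is returned and constant hoisting ignores the immediate; a 64-bit
// constant needing four moves is reported at its real cost and becomes a
// hoisting candidate.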
450
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());
456
457 unsigned BitSize = Ty->getPrimitiveSizeInBits();
458 // There is no cost model for constants with a bit size of 0. Return TCC_Free
459 // here, so that constant hoisting will ignore this constant.
460 if (BitSize == 0)
461 return TTI::TCC_Free;
462
463 // Most (all?) AArch64 intrinsics do not support folding immediates into the
464 // selected instruction, so we compute the materialization cost for the
465 // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

469 switch (IID) {
470 default:
471 return TTI::TCC_Free;
472 case Intrinsic::sadd_with_overflow:
473 case Intrinsic::uadd_with_overflow:
474 case Intrinsic::ssub_with_overflow:
475 case Intrinsic::usub_with_overflow:
476 case Intrinsic::smul_with_overflow:
477 case Intrinsic::umul_with_overflow:
478 if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
482 ? static_cast<int>(TTI::TCC_Free)
483 : Cost;
484 }
485 break;
486 case Intrinsic::experimental_stackmap:
487 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
488 return TTI::TCC_Free;
489 break;
490 case Intrinsic::experimental_patchpoint_void:
491 case Intrinsic::experimental_patchpoint:
492 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
493 return TTI::TCC_Free;
494 break;
495 case Intrinsic::experimental_gc_statepoint:
496 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
497 return TTI::TCC_Free;
498 break;
499 }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
502
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}
511
static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}
516
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements

  // Only allow (32b and 64b) integers or pointers for now...
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
      (EltTy->getScalarSizeInBits() != 32 &&
       EltTy->getScalarSizeInBits() != 64))
    return InstructionCost::getInvalid();

  // FIXME: Hacky check for legal vector types. We can promote smaller types
  // but we cannot legalize vectors via splitting for histcnt.
  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
         VTy->getElementCount().getKnownMinValue() != 4) ||
        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
        !VTy->isScalableTy())
      return InstructionCost::getInvalid();

  return InstructionCost(BaseHistCntCost);
}
540
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

553 switch (ICA.getID()) {
554 case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
558 case Intrinsic::umin:
559 case Intrinsic::umax:
560 case Intrinsic::smin:
561 case Intrinsic::smax: {
562 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
563 MVT::v8i16, MVT::v2i32, MVT::v4i32,
564 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif hence the cost of 2
568 if (LT.second == MVT::v2i64)
569 return LT.first * 2;
570 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
571 return LT.first;
572 break;
573 }
574 case Intrinsic::sadd_sat:
575 case Intrinsic::ssub_sat:
576 case Intrinsic::uadd_sat:
577 case Intrinsic::usub_sat: {
578 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
579 MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
583 // need to extend the type, as it uses shr(qadd(shl, shl)).
584 unsigned Instrs =
585 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
586 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
587 return LT.first * Instrs;
588 break;
589 }
590 case Intrinsic::abs: {
591 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
592 MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
596 return LT.first;
597 break;
598 }
599 case Intrinsic::bswap: {
600 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
604 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
605 return LT.first;
606 break;
607 }
608 case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
612 // (LT.first - 1) vector adds.
613 if (LT.first > 1) {
614 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
615 InstructionCost AddCost =
616 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
617 Cost += AddCost * (LT.first - 1);
618 }
619 return Cost;
620 }
621 case Intrinsic::vector_extract:
622 case Intrinsic::vector_insert: {
623 // If both the vector and subvector types are legal types and the index
624 // is 0, then this should be a no-op or simple operation; return a
625 // relatively low cost.
626
627 // If arguments aren't actually supplied, then we cannot determine the
628 // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;
632
633 LLVMContext &C = RetTy->getContext();
634 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
635 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
636 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
637 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
638 // Skip this if either the vector or subvector types are unpacked
639 // SVE types; they may get lowered to stack stores and loads.
640 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
641 break;
642
    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
647 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
648 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
649 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
650 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
651 return TTI::TCC_Free;
652 break;
653 }
654 case Intrinsic::bitreverse: {
655 static const CostTblEntry BitreverseTbl[] = {
656 {Intrinsic::bitreverse, MVT::i32, 1},
657 {Intrinsic::bitreverse, MVT::i64, 1},
658 {Intrinsic::bitreverse, MVT::v8i8, 1},
659 {Intrinsic::bitreverse, MVT::v16i8, 1},
660 {Intrinsic::bitreverse, MVT::v4i16, 2},
661 {Intrinsic::bitreverse, MVT::v8i16, 2},
662 {Intrinsic::bitreverse, MVT::v2i32, 2},
663 {Intrinsic::bitreverse, MVT::v4i32, 2},
664 {Intrinsic::bitreverse, MVT::v1i64, 2},
665 {Intrinsic::bitreverse, MVT::v2i64, 2},
666 };
667 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
668 const auto *Entry =
669 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
670 if (Entry) {
671 // Cost Model is using the legal type(i32) that i8 and i16 will be
672 // converted to +1 so that we match the actual lowering cost
673 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
674 TLI->getValueType(DL, RetTy, true) == MVT::i16)
675 return LegalisationCost.first * Entry->Cost + 1;
676
677 return LegalisationCost.first * Entry->Cost;
678 }
679 break;
680 }
681 case Intrinsic::ctpop: {
682 if (!ST->hasNEON()) {
683 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
684 return getTypeLegalizationCost(RetTy).first * 12;
685 }
686 static const CostTblEntry CtpopCostTbl[] = {
687 {ISD::CTPOP, MVT::v2i64, 4},
688 {ISD::CTPOP, MVT::v4i32, 3},
689 {ISD::CTPOP, MVT::v8i16, 2},
690 {ISD::CTPOP, MVT::v16i8, 1},
691 {ISD::CTPOP, MVT::i64, 4},
692 {ISD::CTPOP, MVT::v2i32, 3},
693 {ISD::CTPOP, MVT::v4i16, 2},
694 {ISD::CTPOP, MVT::v8i8, 1},
695 {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
699 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
700 // Extra cost of +1 when illegal vector types are legalized by promoting
701 // the integer type.
702 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
703 RetTy->getScalarSizeInBits()
704 ? 1
705 : 0;
706 return LT.first * Entry->Cost + ExtraCost;
707 }
708 break;
709 }
710 case Intrinsic::sadd_with_overflow:
711 case Intrinsic::uadd_with_overflow:
712 case Intrinsic::ssub_with_overflow:
713 case Intrinsic::usub_with_overflow:
714 case Intrinsic::smul_with_overflow:
715 case Intrinsic::umul_with_overflow: {
716 static const CostTblEntry WithOverflowCostTbl[] = {
717 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
718 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
719 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
720 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
721 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
722 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
723 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
724 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
725 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
726 {Intrinsic::usub_with_overflow, MVT::i8, 3},
727 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
728 {Intrinsic::usub_with_overflow, MVT::i16, 3},
729 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
730 {Intrinsic::usub_with_overflow, MVT::i32, 1},
731 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
732 {Intrinsic::usub_with_overflow, MVT::i64, 1},
733 {Intrinsic::smul_with_overflow, MVT::i8, 5},
734 {Intrinsic::umul_with_overflow, MVT::i8, 4},
735 {Intrinsic::smul_with_overflow, MVT::i16, 5},
736 {Intrinsic::umul_with_overflow, MVT::i16, 4},
737 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
738 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
739 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
740 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
741 };
742 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
743 if (MTy.isSimple())
744 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
745 MTy.getSimpleVT()))
746 return Entry->Cost;
747 break;
748 }
749 case Intrinsic::fptosi_sat:
750 case Intrinsic::fptoui_sat: {
751 if (ICA.getArgTypes().empty())
752 break;
753 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
754 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
755 EVT MTy = TLI->getValueType(DL, RetTy);
756 // Check for the legal types, which are where the size of the input and the
757 // output are the same, or we are using cvt f64->i32 or f32->i64.
758 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
759 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
760 LT.second == MVT::v2f64)) {
761 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
762 (LT.second == MVT::f64 && MTy == MVT::i32) ||
763 (LT.second == MVT::f32 && MTy == MVT::i64)))
764 return LT.first;
765 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
766 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
767 MTy.getScalarSizeInBits() == 64)
768 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
769 }
770 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
771 // f32.
772 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
773 return LT.first + getIntrinsicInstrCost(
774 {ICA.getID(),
775 RetTy,
776 {ICA.getArgTypes()[0]->getWithNewType(
777 Type::getFloatTy(RetTy->getContext()))}},
778 CostKind);
779 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
780 (LT.second == MVT::f16 && MTy == MVT::i64) ||
781 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
782 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
783 return LT.first;
784 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
785 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
786 MTy.getScalarSizeInBits() == 32)
787 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
788 // Extending vector types v8f16->v8i32. These current scalarize but the
789 // codegen could be better.
790 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
791 MTy.getScalarSizeInBits() == 64)
792 return MTy.getVectorNumElements() * 3;
793
794 // If we can we use a legal convert followed by a min+max
795 if ((LT.second.getScalarType() == MVT::f32 ||
796 LT.second.getScalarType() == MVT::f64 ||
797 LT.second.getScalarType() == MVT::f16) &&
798 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
799 Type *LegalTy =
800 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
801 if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost +
811 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
812 : 1);
813 }
814 // Otherwise we need to follow the default expansion that clamps the value
815 // using a float min/max with a fcmp+sel for nan handling when signed.
816 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
817 RetTy = RetTy->getScalarType();
818 if (LT.second.isVector()) {
819 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
820 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
821 }
    IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
    InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
    IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
    Cost += getIntrinsicInstrCost(Attrs2, CostKind);
    Cost +=
        getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
                         RetTy, FPTy, TTI::CastContextHint::None, CostKind);
    if (IsSigned) {
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                 CmpInst::FCMP_UNO, CostKind);
    }
    return LT.first * Cost;
837 }
838 case Intrinsic::fshl:
839 case Intrinsic::fshr: {
840 if (ICA.getArgs().empty())
841 break;
842
843 // TODO: Add handling for fshl where third argument is not a constant.
844 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
845 if (!OpInfoZ.isConstant())
846 break;
847
848 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
849 if (OpInfoZ.isUniform()) {
850 // FIXME: The costs could be lower if the codegen is better.
851 static const CostTblEntry FshlTbl[] = {
852 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
853 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
854 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
855 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
856 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
857 // to avoid having to duplicate the costs.
858 const auto *Entry =
859 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
860 if (Entry)
861 return LegalisationCost.first * Entry->Cost;
862 }
863
864 auto TyL = getTypeLegalizationCost(RetTy);
865 if (!RetTy->isIntegerTy())
866 break;
867
868 // Estimate cost manually, as types like i8 and i16 will get promoted to
869 // i32 and CostTableLookup will ignore the extra conversion cost.
870 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
871 RetTy->getScalarSizeInBits() < 64) ||
872 (RetTy->getScalarSizeInBits() % 64 != 0);
873 unsigned ExtraCost = HigherCost ? 1 : 0;
874 if (RetTy->getScalarSizeInBits() == 32 ||
875 RetTy->getScalarSizeInBits() == 64)
876 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
877 // extr instruction.
878 else if (HigherCost)
879 ExtraCost = 1;
880 else
881 break;
882 return TyL.first + ExtraCost;
883 }
884 case Intrinsic::get_active_lane_mask: {
885 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
886 if (RetTy) {
887 EVT RetVT = getTLI()->getValueType(DL, RetTy);
888 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
889 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
890 !getTLI()->isTypeLegal(RetVT)) {
891 // We don't have enough context at this point to determine if the mask
892 // is going to be kept live after the block, which will force the vXi1
893 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
894 // For now, we just assume the vectorizer created this intrinsic and
895 // the result will be the input for a PHI. In this case the cost will
896 // be extremely high for fixed-width vectors.
897 // NOTE: getScalarizationOverhead returns a cost that's far too
898 // pessimistic for the actual generated codegen. In reality there are
899 // two instructions generated per lane.
900 return RetTy->getNumElements() * 2;
901 }
902 }
903 break;
904 }
905 default:
906 break;
907 }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
910
/// The function will remove redundant reinterprets casting in the presence
/// of control flow.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();
917
918 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
919 assert(PN && "Expected Phi Node!");
920
921 // Don't create a new Phi unless we can remove the old one.
922 if (!PN->hasOneUse())
923 return std::nullopt;
924
925 for (Value *IncValPhi : PN->incoming_values()) {
926 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
927 if (!Reinterpret ||
928 Reinterpret->getIntrinsicID() !=
929 Intrinsic::aarch64_sve_convert_to_svbool ||
930 RequiredType != Reinterpret->getArgOperand(0)->getType())
931 return std::nullopt;
932 }
933
934 // Create the new Phi
935 IC.Builder.SetInsertPoint(PN);
936 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
937 Worklist.push_back(PN);
938
939 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
940 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
941 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
942 Worklist.push_back(Reinterpret);
943 }
944
945 // Cleanup Phi Node and reinterprets
946 return IC.replaceInstUsesWith(II, NPN);
947}
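
// Example of the rewrite above: a convert.from.svbool whose operand is a phi
// of convert.to.svbool values, all taken from <vscale x 4 x i1> inputs, is
// replaced by a phi of those <vscale x 4 x i1> inputs directly, leaving the
// old phi and the reinterprets dead.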
948
949// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
950// => (binop (pred) (from_svbool _) (from_svbool _))
951//
952// The above transformation eliminates a `to_svbool` in the predicate
953// operand of bitwise operation `binop` by narrowing the vector width of
954// the operation. For example, it would convert a `<vscale x 16 x i1>
955// and` into a `<vscale x 4 x i1> and`. This is profitable because
956// to_svbool must zero the new lanes during widening, whereas
957// from_svbool is free.
958static std::optional<Instruction *>
960 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
961 if (!BinOp)
962 return std::nullopt;
963
964 auto IntrinsicID = BinOp->getIntrinsicID();
965 switch (IntrinsicID) {
966 case Intrinsic::aarch64_sve_and_z:
967 case Intrinsic::aarch64_sve_bic_z:
968 case Intrinsic::aarch64_sve_eor_z:
969 case Intrinsic::aarch64_sve_nand_z:
970 case Intrinsic::aarch64_sve_nor_z:
971 case Intrinsic::aarch64_sve_orn_z:
972 case Intrinsic::aarch64_sve_orr_z:
973 break;
974 default:
975 return std::nullopt;
976 }
977
978 auto BinOpPred = BinOp->getOperand(0);
979 auto BinOpOp1 = BinOp->getOperand(1);
980 auto BinOpOp2 = BinOp->getOperand(2);
981
982 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
983 if (!PredIntr ||
984 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
985 return std::nullopt;
986
987 auto PredOp = PredIntr->getOperand(0);
988 auto PredOpTy = cast<VectorType>(PredOp->getType());
989 if (PredOpTy != II.getType())
990 return std::nullopt;
991
992 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
993 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
994 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
995 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
996 if (BinOpOp1 == BinOpOp2)
997 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
998 else
999 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
1000 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
1001
1002 auto NarrowedBinOp =
1003 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
1004 return IC.replaceInstUsesWith(II, NarrowedBinOp);
1005}
1006
1007static std::optional<Instruction *>
1009 // If the reinterpret instruction operand is a PHI Node
1010 if (isa<PHINode>(II.getArgOperand(0)))
1011 return processPhiNode(IC, II);
1012
1013 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1014 return BinOpCombine;
1015
1016 // Ignore converts to/from svcount_t.
1017 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
1018 isa<TargetExtType>(II.getType()))
1019 return std::nullopt;
1020
1021 SmallVector<Instruction *, 32> CandidatesForRemoval;
1022 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
1023
1024 const auto *IVTy = cast<VectorType>(II.getType());
1025
1026 // Walk the chain of conversions.
1027 while (Cursor) {
1028 // If the type of the cursor has fewer lanes than the final result, zeroing
1029 // must take place, which breaks the equivalence chain.
1030 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
1031 if (CursorVTy->getElementCount().getKnownMinValue() <
1032 IVTy->getElementCount().getKnownMinValue())
1033 break;
1034
1035 // If the cursor has the same type as I, it is a viable replacement.
1036 if (Cursor->getType() == IVTy)
1037 EarliestReplacement = Cursor;
1038
1039 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
1040
1041 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1042 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1043 Intrinsic::aarch64_sve_convert_to_svbool ||
1044 IntrinsicCursor->getIntrinsicID() ==
1045 Intrinsic::aarch64_sve_convert_from_svbool))
1046 break;
1047
1048 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
1049 Cursor = IntrinsicCursor->getOperand(0);
1050 }
1051
1052 // If no viable replacement in the conversion chain was found, there is
1053 // nothing to do.
1054 if (!EarliestReplacement)
1055 return std::nullopt;
1056
1057 return IC.replaceInstUsesWith(II, EarliestReplacement);
1058}
1059
1060static bool isAllActivePredicate(Value *Pred) {
1061 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1062 Value *UncastedPred;
1063 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1064 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1065 m_Value(UncastedPred)))))
1066 // If the predicate has the same or less lanes than the uncasted
1067 // predicate then we know the casting has no effect.
1068 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1069 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1070 Pred = UncastedPred;
1071
1072 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1073 m_ConstantInt<AArch64SVEPredPattern::all>()));
1074}
1075
1076// Erase unary operation where predicate has all inactive lanes
static std::optional<Instruction *>
instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
                                 int PredPos) {
1080 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1081 return IC.eraseInstFromFunction(II);
1082 }
1083 return std::nullopt;
1084}
1085
1086// Simplify unary operation where predicate has all inactive lanes by replacing
1087// instruction with zeroed object
static std::optional<Instruction *>
instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
  if (match(II.getOperand(0), m_ZeroInt())) {
1091 Constant *Node;
1092 Type *RetTy = II.getType();
1093 if (RetTy->isStructTy()) {
1094 auto StructT = cast<StructType>(RetTy);
      auto VecT = StructT->getElementType(0);
      SmallVector<Constant *, 4> ZerVec;
      for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1098 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1099 : ConstantInt::get(VecT, 0));
1100 }
1101 Node = ConstantStruct::get(StructT, ZerVec);
1102 } else if (RetTy->isFPOrFPVectorTy())
1103 Node = ConstantFP::get(RetTy, 0.0);
1104 else
1105 Node = ConstantInt::get(II.getType(), 0);

    IC.replaceInstUsesWith(II, Node);
    return IC.eraseInstFromFunction(II);
1109 }
1110 return std::nullopt;
1111}
1112
1113static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1114 IntrinsicInst &II) {
1115 // svsel(ptrue, x, y) => x
1116 auto *OpPredicate = II.getOperand(0);
1117 if (isAllActivePredicate(OpPredicate))
1118 return IC.replaceInstUsesWith(II, II.getOperand(1));
1119
1120 auto Select =
1121 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1122 return IC.replaceInstUsesWith(II, Select);
1123}
1124
1125static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1126 IntrinsicInst &II) {
1127 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1128 if (!Pg)
1129 return std::nullopt;
1130
1131 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1132 return std::nullopt;
1133
1134 const auto PTruePattern =
1135 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1136 if (PTruePattern != AArch64SVEPredPattern::vl1)
1137 return std::nullopt;
1138
1139 // The intrinsic is inserting into lane zero so use an insert instead.
1140 auto *IdxTy = Type::getInt64Ty(II.getContext());
1141 auto *Insert = InsertElementInst::Create(
1142 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1143 Insert->insertBefore(&II);
1144 Insert->takeName(&II);
1145
1146 return IC.replaceInstUsesWith(II, Insert);
1147}
1148
1149static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1150 IntrinsicInst &II) {
1151 // Replace DupX with a regular IR splat.
1152 auto *RetTy = cast<ScalableVectorType>(II.getType());
1153 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1154 II.getArgOperand(0));
1155 Splat->takeName(&II);
1156 return IC.replaceInstUsesWith(II, Splat);
1157}
1158
1159static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1160 IntrinsicInst &II) {
1161 LLVMContext &Ctx = II.getContext();
1162
1163 // Check that the predicate is all active
1164 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1165 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1166 return std::nullopt;
1167
1168 const auto PTruePattern =
1169 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1170 if (PTruePattern != AArch64SVEPredPattern::all)
1171 return std::nullopt;
1172
1173 // Check that we have a compare of zero..
1174 auto *SplatValue =
1175 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1176 if (!SplatValue || !SplatValue->isZero())
1177 return std::nullopt;
1178
1179 // ..against a dupq
1180 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1181 if (!DupQLane ||
1182 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1183 return std::nullopt;
1184
1185 // Where the dupq is a lane 0 replicate of a vector insert
1186 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1187 return std::nullopt;
1188
1189 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1190 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1191 return std::nullopt;
1192
1193 // Where the vector insert is a fixed constant vector insert into undef at
1194 // index zero
1195 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1196 return std::nullopt;
1197
1198 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1199 return std::nullopt;
1200
1201 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1202 if (!ConstVec)
1203 return std::nullopt;
1204
1205 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1206 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1207 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1208 return std::nullopt;
1209
1210 unsigned NumElts = VecTy->getNumElements();
1211 unsigned PredicateBits = 0;
1212
1213 // Expand intrinsic operands to a 16-bit byte level predicate
1214 for (unsigned I = 0; I < NumElts; ++I) {
1215 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1216 if (!Arg)
1217 return std::nullopt;
1218 if (!Arg->isZero())
1219 PredicateBits |= 1 << (I * (16 / NumElts));
1220 }
1221
1222 // If all bits are zero bail early with an empty predicate
1223 if (PredicateBits == 0) {
1224 auto *PFalse = Constant::getNullValue(II.getType());
1225 PFalse->takeName(&II);
1226 return IC.replaceInstUsesWith(II, PFalse);
1227 }
1228
1229 // Calculate largest predicate type used (where byte predicate is largest)
1230 unsigned Mask = 8;
1231 for (unsigned I = 0; I < 16; ++I)
1232 if ((PredicateBits & (1 << I)) != 0)
1233 Mask |= (I % 8);
1234
1235 unsigned PredSize = Mask & -Mask;
1236 auto *PredType = ScalableVectorType::get(
1237 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1238
1239 // Ensure all relevant bits are set
1240 for (unsigned I = 0; I < 16; I += PredSize)
1241 if ((PredicateBits & (1 << I)) == 0)
1242 return std::nullopt;
1243
1244 auto *PTruePat =
1245 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1246 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1247 {PredType}, {PTruePat});
1248 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1249 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1250 auto *ConvertFromSVBool =
1251 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1252 {II.getType()}, {ConvertToSVBool});
1253
1254 ConvertFromSVBool->takeName(&II);
1255 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1256}
1257
1258static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1259 IntrinsicInst &II) {
1260 Value *Pg = II.getArgOperand(0);
1261 Value *Vec = II.getArgOperand(1);
1262 auto IntrinsicID = II.getIntrinsicID();
1263 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1264
1265 // lastX(splat(X)) --> X
1266 if (auto *SplatVal = getSplatValue(Vec))
1267 return IC.replaceInstUsesWith(II, SplatVal);
1268
1269 // If x and/or y is a splat value then:
1270 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1271 Value *LHS, *RHS;
1272 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1273 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1274 auto *OldBinOp = cast<BinaryOperator>(Vec);
1275 auto OpC = OldBinOp->getOpcode();
1276 auto *NewLHS =
1277 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1278 auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1282 return IC.replaceInstUsesWith(II, NewBinOp);
1283 }
1284 }
1285
1286 auto *C = dyn_cast<Constant>(Pg);
1287 if (IsAfter && C && C->isNullValue()) {
1288 // The intrinsic is extracting lane 0 so use an extract instead.
1289 auto *IdxTy = Type::getInt64Ty(II.getContext());
1290 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1291 Extract->insertBefore(&II);
1292 Extract->takeName(&II);
1293 return IC.replaceInstUsesWith(II, Extract);
1294 }
1295
1296 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1297 if (!IntrPG)
1298 return std::nullopt;
1299
1300 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1301 return std::nullopt;
1302
1303 const auto PTruePattern =
1304 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1305
1306 // Can the intrinsic's predicate be converted to a known constant index?
1307 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1308 if (!MinNumElts)
1309 return std::nullopt;
1310
1311 unsigned Idx = MinNumElts - 1;
1312 // Increment the index if extracting the element after the last active
1313 // predicate element.
1314 if (IsAfter)
1315 ++Idx;
1316
1317 // Ignore extracts whose index is larger than the known minimum vector
1318 // length. NOTE: This is an artificial constraint where we prefer to
1319 // maintain what the user asked for until an alternative is proven faster.
1320 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1321 if (Idx >= PgVTy->getMinNumElements())
1322 return std::nullopt;
1323
1324 // The intrinsic is extracting a fixed lane so use an extract instead.
1325 auto *IdxTy = Type::getInt64Ty(II.getContext());
1326 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1327 Extract->insertBefore(&II);
1328 Extract->takeName(&II);
1329 return IC.replaceInstUsesWith(II, Extract);
1330}
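
// Example: "lastb(ptrue(vl8), %v)" extracts the last active element and
// becomes "extractelement %v, i64 7", while "lasta(ptrue(vl8), %v)" extracts
// the element after it, "extractelement %v, i64 8", provided the index stays
// below the vector's known minimum element count.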
1331
1332static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1333 IntrinsicInst &II) {
1334 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1335 // integer variant across a variety of micro-architectures. Replace scalar
1336 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1337 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1338 // depending on the micro-architecture, but has been observed as generally
1339 // being faster, particularly when the CLAST[AB] op is a loop-carried
1340 // dependency.
1341 Value *Pg = II.getArgOperand(0);
1342 Value *Fallback = II.getArgOperand(1);
1343 Value *Vec = II.getArgOperand(2);
1344 Type *Ty = II.getType();
1345
1346 if (!Ty->isIntegerTy())
1347 return std::nullopt;
1348
1349 Type *FPTy;
1350 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1351 default:
1352 return std::nullopt;
1353 case 16:
1354 FPTy = IC.Builder.getHalfTy();
1355 break;
1356 case 32:
1357 FPTy = IC.Builder.getFloatTy();
1358 break;
1359 case 64:
1360 FPTy = IC.Builder.getDoubleTy();
1361 break;
1362 }
1363
1364 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1365 auto *FPVTy = VectorType::get(
1366 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1367 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1368 auto *FPII = IC.Builder.CreateIntrinsic(
1369 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1370 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1371 return IC.replaceInstUsesWith(II, FPIItoInt);
1372}
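
// Example of the rewrite above: a scalar i32 clasta such as
// "clasta(%pg, i32 %fallback, <vscale x 4 x i32> %v)" is re-emitted in the
// SIMD&FP form: %fallback is bitcast to float, %v to <vscale x 4 x float>,
// and the float result is bitcast back to i32.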
1373
1374static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1375 IntrinsicInst &II) {
1376 LLVMContext &Ctx = II.getContext();
1377 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1378 // can work with RDFFR_PP for ptest elimination.
1379 auto *AllPat =
1380 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1381 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1382 {II.getType()}, {AllPat});
1383 auto *RDFFR =
1384 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1385 RDFFR->takeName(&II);
1386 return IC.replaceInstUsesWith(II, RDFFR);
1387}
1388
static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1392
1393 if (Pattern == AArch64SVEPredPattern::all) {
1394 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1395 auto *VScale = IC.Builder.CreateVScale(StepVal);
1396 VScale->takeName(&II);
1397 return IC.replaceInstUsesWith(II, VScale);
1398 }
1399
1400 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1401
1402 return MinNumElts && NumElts >= MinNumElts
1403 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1404 II, ConstantInt::get(II.getType(), MinNumElts)))
1405 : std::nullopt;
1406}
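
// Example: cntw with the "all" pattern (NumElts == 4) folds to "vscale * 4",
// while cntw with the "vl4" pattern folds to the constant 4, since every
// 128-bit granule provides at least four 32-bit elements.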
1407
1408static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1409 IntrinsicInst &II) {
1410 Value *PgVal = II.getArgOperand(0);
1411 Value *OpVal = II.getArgOperand(1);
1412
1413 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1414 // Later optimizations prefer this form.
1415 if (PgVal == OpVal &&
1416 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1417 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1418 Value *Ops[] = {PgVal, OpVal};
1419 Type *Tys[] = {PgVal->getType()};
1420
1421 auto *PTest =
1422 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1423 PTest->takeName(&II);
1424
1425 return IC.replaceInstUsesWith(II, PTest);
1426 }
1427
1428 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1429 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1430
1431 if (!Pg || !Op)
1432 return std::nullopt;
1433
1434 Intrinsic::ID OpIID = Op->getIntrinsicID();
1435
1436 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1437 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1438 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1439 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1440 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1441
1442 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1443
1444 PTest->takeName(&II);
1445 return IC.replaceInstUsesWith(II, PTest);
1446 }
1447
1448 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1449 // Later optimizations may rewrite sequence to use the flag-setting variant
1450 // of instruction X to remove PTEST.
1451 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1452 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1453 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1454 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1455 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1456 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1457 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1458 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1459 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1460 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1461 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1462 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1463 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1464 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1465 Type *Tys[] = {Pg->getType()};
1466
1467 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1468 PTest->takeName(&II);
1469
1470 return IC.replaceInstUsesWith(II, PTest);
1471 }
1472
1473 return std::nullopt;
1474}
1475
1476template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {
1480 Value *P = II.getOperand(0);
1481 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1482 if (MergeIntoAddendOp) {
1483 AddendOp = II.getOperand(1);
1484 Mul = II.getOperand(2);
1485 } else {
1486 AddendOp = II.getOperand(2);
1487 Mul = II.getOperand(1);
1488 }
1489
1490 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1491 m_Value(MulOp1))))
1492 return std::nullopt;
1493
1494 if (!Mul->hasOneUse())
1495 return std::nullopt;
1496
1497 Instruction *FMFSource = nullptr;
1498 if (II.getType()->isFPOrFPVectorTy()) {
1499 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1500 // Stop the combine when the flags on the inputs differ in case dropping
1501 // flags would lead to us missing out on more beneficial optimizations.
1502 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1503 return std::nullopt;
1504 if (!FAddFlags.allowContract())
1505 return std::nullopt;
1506 FMFSource = &II;
1507 }
1508
1509 CallInst *Res;
1510 if (MergeIntoAddendOp)
1511 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1512 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1513 else
1514 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1515 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1516
1517 return IC.replaceInstUsesWith(II, Res);
1518}
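
// Example instantiation (see the uses further down): with
// MulOpc = aarch64_sve_fmul and FuseOpc = aarch64_sve_fmla, a contractable
// single-use "fadd(%pg, %a, fmul(%pg, %b, %c))" is folded into
// "fmla(%pg, %a, %b, %c)".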
1519
static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *Pred = II.getOperand(0);
1523 Value *PtrOp = II.getOperand(1);
1524 Type *VecTy = II.getType();
1525
1526 // Replace by zero constant when all lanes are inactive
1527 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1528 return II_NA;
1529
1530 if (isAllActivePredicate(Pred)) {
1531 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1532 Load->copyMetadata(II);
1533 return IC.replaceInstUsesWith(II, Load);
1534 }
1535
1536 CallInst *MaskedLoad =
1537 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1538 Pred, ConstantAggregateZero::get(VecTy));
1539 MaskedLoad->copyMetadata(II);
1540 return IC.replaceInstUsesWith(II, MaskedLoad);
1541}
1542
static std::optional<Instruction *>
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
  Value *VecOp = II.getOperand(0);
1546 Value *Pred = II.getOperand(1);
1547 Value *PtrOp = II.getOperand(2);
1548
1549 if (isAllActivePredicate(Pred)) {
1550 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1551 Store->copyMetadata(II);
1552 return IC.eraseInstFromFunction(II);
1553 }
1554
1555 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1556 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1557 MaskedStore->copyMetadata(II);
1558 return IC.eraseInstFromFunction(II);
1559}

static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
  switch (Intrinsic) {
1563 case Intrinsic::aarch64_sve_fmul_u:
1564 return Instruction::BinaryOps::FMul;
1565 case Intrinsic::aarch64_sve_fadd_u:
1566 return Instruction::BinaryOps::FAdd;
1567 case Intrinsic::aarch64_sve_fsub_u:
1568 return Instruction::BinaryOps::FSub;
1569 default:
1570 return Instruction::BinaryOpsEnd;
1571 }
1572}
1573
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
  // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1577 if (II.isStrictFP())
1578 return std::nullopt;
1579
1580 auto *OpPredicate = II.getOperand(0);
1581 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1582 if (BinOpCode == Instruction::BinaryOpsEnd ||
1583 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1584 m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;
  IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
  IC.Builder.setFastMathFlags(II.getFastMathFlags());
1588 auto BinOp =
1589 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1590 return IC.replaceInstUsesWith(II, BinOp);
1591}
1592
1593// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1594// sve.add_u).
1595static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1596 Intrinsic::ID IID) {
1597 auto *OpPredicate = II.getOperand(0);
1598 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1599 m_ConstantInt<AArch64SVEPredPattern::all>())))
1600 return std::nullopt;
1601
1602 auto *Mod = II.getModule();
1603 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1604 II.setCalledFunction(NewDecl);
1605
1606 return &II;
1607}
1608
1609// Simplify operations where the predicate has all inactive lanes, or try to
1610// replace them with the _u form when all lanes are active.
1611static std::optional<Instruction *>
1612instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1613 Intrinsic::ID IID) {
1614 if (match(II.getOperand(0), m_ZeroInt())) {
1615 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1616 // inactive for sv[func]_m
1617 return IC.replaceInstUsesWith(II, II.getOperand(1));
1618 }
1619 return instCombineSVEAllActive(II, IID);
1620}
1621
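// Example (schematic): sve.add(%pg, %a, sve.mul(%pg, %b, %c)) fuses to
// sve.mla(%pg, %a, %b, %c) when the multiply has no other users; when the
// multiply feeds the other operand position the MAD form is used instead.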
1622static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1623 IntrinsicInst &II) {
1624 if (auto II_U =
1625 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1626 return II_U;
1627 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1628 Intrinsic::aarch64_sve_mla>(
1629 IC, II, true))
1630 return MLA;
1631 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1632 Intrinsic::aarch64_sve_mad>(
1633 IC, II, false))
1634 return MAD;
1635 return std::nullopt;
1636}
1637
1638static std::optional<Instruction *>
1639instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1640 if (auto II_U =
1641 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1642 return II_U;
1643 if (auto FMLA =
1644 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1645 Intrinsic::aarch64_sve_fmla>(IC, II,
1646 true))
1647 return FMLA;
1648 if (auto FMAD =
1649 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1650 Intrinsic::aarch64_sve_fmad>(IC, II,
1651 false))
1652 return FMAD;
1653 if (auto FMLA =
1654 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1655 Intrinsic::aarch64_sve_fmla>(IC, II,
1656 true))
1657 return FMLA;
1658 return std::nullopt;
1659}
1660
1661static std::optional<Instruction *>
1662instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1663 if (auto FMLA =
1664 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1665 Intrinsic::aarch64_sve_fmla>(IC, II,
1666 true))
1667 return FMLA;
1668 if (auto FMAD =
1669 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1670 Intrinsic::aarch64_sve_fmad>(IC, II,
1671 false))
1672 return FMAD;
1673 if (auto FMLA_U =
1674 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1675 Intrinsic::aarch64_sve_fmla_u>(
1676 IC, II, true))
1677 return FMLA_U;
1678 return instCombineSVEVectorBinOp(IC, II);
1679}
1680
1681static std::optional<Instruction *>
1682instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1683 if (auto II_U =
1684 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1685 return II_U;
1686 if (auto FMLS =
1687 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1688 Intrinsic::aarch64_sve_fmls>(IC, II,
1689 true))
1690 return FMLS;
1691 if (auto FMSB =
1692 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1693 Intrinsic::aarch64_sve_fnmsb>(
1694 IC, II, false))
1695 return FMSB;
1696 if (auto FMLS =
1697 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1698 Intrinsic::aarch64_sve_fmls>(IC, II,
1699 true))
1700 return FMLS;
1701 return std::nullopt;
1702}
1703
1704static std::optional<Instruction *>
1705instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1706 if (auto FMLS =
1707 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1708 Intrinsic::aarch64_sve_fmls>(IC, II,
1709 true))
1710 return FMLS;
1711 if (auto FMSB =
1712 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1713 Intrinsic::aarch64_sve_fnmsb>(
1714 IC, II, false))
1715 return FMSB;
1716 if (auto FMLS_U =
1717 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1718 Intrinsic::aarch64_sve_fmls_u>(
1719 IC, II, true))
1720 return FMLS_U;
1721 return instCombineSVEVectorBinOp(IC, II);
1722}
1723
1724static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1725 IntrinsicInst &II) {
1726 if (auto II_U =
1727 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1728 return II_U;
1729 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1730 Intrinsic::aarch64_sve_mls>(
1731 IC, II, true))
1732 return MLS;
1733 return std::nullopt;
1734}
1735
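// Example (schematic): multiplying by a splat of one is an identity, so
// sve.mul(%pg, %x, sve.dup.x(1)) folds to %x; the same applies to an sve.dup
// of 1 whose governing predicate matches %pg.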
1736static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1737 IntrinsicInst &II,
1738 Intrinsic::ID IID) {
1739 auto *OpPredicate = II.getOperand(0);
1740 auto *OpMultiplicand = II.getOperand(1);
1741 auto *OpMultiplier = II.getOperand(2);
1742
1743 // Return true if the given value is a splat of one (1 or 1.0), false otherwise.
1744 auto IsUnitSplat = [](auto *I) {
1745 auto *SplatValue = getSplatValue(I);
1746 if (!SplatValue)
1747 return false;
1748 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1749 };
1750
1751 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1752 // with a unit splat value, false otherwise.
1753 auto IsUnitDup = [](auto *I) {
1754 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1755 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1756 return false;
1757
1758 auto *SplatValue = IntrI->getOperand(2);
1759 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1760 };
1761
1762 if (IsUnitSplat(OpMultiplier)) {
1763 // [f]mul pg %n, (dupx 1) => %n
1764 OpMultiplicand->takeName(&II);
1765 return IC.replaceInstUsesWith(II, OpMultiplicand);
1766 } else if (IsUnitDup(OpMultiplier)) {
1767 // [f]mul pg %n, (dup pg 1) => %n
1768 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1769 auto *DupPg = DupInst->getOperand(1);
1770 // TODO: this is naive. The optimization is still valid if DupPg
1771 // 'encompasses' OpPredicate, not only if they're the same predicate.
1772 if (OpPredicate == DupPg) {
1773 OpMultiplicand->takeName(&II);
1774 return IC.replaceInstUsesWith(II, OpMultiplicand);
1775 }
1776 }
1777
1778 return instCombineSVEVectorBinOp(IC, II);
1779}
1780
1781static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1782 IntrinsicInst &II) {
1783 Value *UnpackArg = II.getArgOperand(0);
1784 auto *RetTy = cast<ScalableVectorType>(II.getType());
1785 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1786 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1787
1788 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1789 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1790 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1791 ScalarArg =
1792 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1793 Value *NewVal =
1794 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1795 NewVal->takeName(&II);
1796 return IC.replaceInstUsesWith(II, NewVal);
1797 }
1798
1799 return std::nullopt;
1800}
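// Example (schematic): a table lookup whose index vector is a constant splat
// below the minimum element count, e.g. sve.tbl(%v, splat(2)), selects the
// same lane everywhere and folds to splat(extractelement %v, 2).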
1801static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1802 IntrinsicInst &II) {
1803 auto *OpVal = II.getOperand(0);
1804 auto *OpIndices = II.getOperand(1);
1805 VectorType *VTy = cast<VectorType>(II.getType());
1806
1807 // Check whether OpIndices is a constant splat value < minimal element count
1808 // of result.
1809 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1810 if (!SplatValue ||
1811 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1812 return std::nullopt;
1813
1814 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1815 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1816 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1817 auto *VectorSplat =
1818 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1819
1820 VectorSplat->takeName(&II);
1821 return IC.replaceInstUsesWith(II, VectorSplat);
1822}
1823
1824static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1825 IntrinsicInst &II) {
1826 Value *A, *B;
1827 Type *RetTy = II.getType();
1828 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1829 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1830
1831 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1832 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1833 if ((match(II.getArgOperand(0),
1834 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1835 match(II.getArgOperand(1),
1836 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1837 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1838 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1839 auto *TyA = cast<ScalableVectorType>(A->getType());
1840 if (TyA == B->getType() &&
1841 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1842 auto *SubVec = IC.Builder.CreateInsertVector(
1843 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1844 auto *ConcatVec = IC.Builder.CreateInsertVector(
1845 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1846 ConcatVec->takeName(&II);
1847 return IC.replaceInstUsesWith(II, ConcatVec);
1848 }
1849 }
1850
1851 return std::nullopt;
1852}
1853
1854static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1855 IntrinsicInst &II) {
1856 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1857 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1858 Value *A, *B;
1859 if (match(II.getArgOperand(0),
1860 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1861 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1862 m_Specific(A), m_Specific(B))))
1863 return IC.replaceInstUsesWith(
1864 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1865
1866 return std::nullopt;
1867}
1868
1869static std::optional<Instruction *>
1870instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1871 Value *Mask = II.getOperand(0);
1872 Value *BasePtr = II.getOperand(1);
1873 Value *Index = II.getOperand(2);
1874 Type *Ty = II.getType();
1875 Value *PassThru = ConstantAggregateZero::get(Ty);
1876
1877 // Replace by zero constant when all lanes are inactive
1878 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1879 return II_NA;
1880
1881 // Contiguous gather => masked load.
1882 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1883 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1884 Value *IndexBase;
1885 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1886 m_Value(IndexBase), m_SpecificInt(1)))) {
1887 Align Alignment =
1888 BasePtr->getPointerAlignment(II.getDataLayout());
1889
1890 Type *VecPtrTy = PointerType::getUnqual(Ty);
1891 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1892 BasePtr, IndexBase);
1893 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1894 CallInst *MaskedLoad =
1895 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1896 MaskedLoad->takeName(&II);
1897 return IC.replaceInstUsesWith(II, MaskedLoad);
1898 }
1899
1900 return std::nullopt;
1901}
1902
1903static std::optional<Instruction *>
1904instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1905 Value *Val = II.getOperand(0);
1906 Value *Mask = II.getOperand(1);
1907 Value *BasePtr = II.getOperand(2);
1908 Value *Index = II.getOperand(3);
1909 Type *Ty = Val->getType();
1910
1911 // Contiguous scatter => masked store.
1912 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1913 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1914 Value *IndexBase;
1915 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1916 m_Value(IndexBase), m_SpecificInt(1)))) {
1917 Align Alignment =
1918 BasePtr->getPointerAlignment(II.getDataLayout());
1919
1920 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1921 BasePtr, IndexBase);
1922 Type *VecPtrTy = PointerType::getUnqual(Ty);
1923 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1924
1925 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1926
1927 return IC.eraseInstFromFunction(II);
1928 }
1929
1930 return std::nullopt;
1931}
1932
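// Example (schematic): a signed divide by a power-of-two splat becomes the
// rounding shift, e.g. sve.sdiv(%pg, %x, splat(8)) -> sve.asrd(%pg, %x, 3);
// a negated power of two additionally emits a predicated sve.neg.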
1933static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1934 IntrinsicInst &II) {
1935 Type *Int32Ty = IC.Builder.getInt32Ty();
1936 Value *Pred = II.getOperand(0);
1937 Value *Vec = II.getOperand(1);
1938 Value *DivVec = II.getOperand(2);
1939
1940 Value *SplatValue = getSplatValue(DivVec);
1941 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1942 if (!SplatConstantInt)
1943 return std::nullopt;
1944 APInt Divisor = SplatConstantInt->getValue();
1945
1946 if (Divisor.isPowerOf2()) {
1947 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1948 auto ASRD = IC.Builder.CreateIntrinsic(
1949 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1950 return IC.replaceInstUsesWith(II, ASRD);
1951 }
1952 if (Divisor.isNegatedPowerOf2()) {
1953 Divisor.negate();
1954 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1955 auto ASRD = IC.Builder.CreateIntrinsic(
1956 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1957 auto NEG = IC.Builder.CreateIntrinsic(
1958 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1959 return IC.replaceInstUsesWith(II, NEG);
1960 }
1961
1962 return std::nullopt;
1963}
1964
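// Example: for Vec = (a, b, a, b) the helper below halves the pattern to
// (a, b); nullptr entries stand for poison lanes and, when AllowPoison is set,
// are filled in from the matching lane of the other half.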
1965bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1966 size_t VecSize = Vec.size();
1967 if (VecSize == 1)
1968 return true;
1969 if (!isPowerOf2_64(VecSize))
1970 return false;
1971 size_t HalfVecSize = VecSize / 2;
1972
1973 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1974 RHS != Vec.end(); LHS++, RHS++) {
1975 if (*LHS != nullptr && *RHS != nullptr) {
1976 if (*LHS == *RHS)
1977 continue;
1978 else
1979 return false;
1980 }
1981 if (!AllowPoison)
1982 return false;
1983 if (*LHS == nullptr && *RHS != nullptr)
1984 *LHS = *RHS;
1985 }
1986
1987 Vec.resize(HalfVecSize);
1988 SimplifyValuePattern(Vec, AllowPoison);
1989 return true;
1990}
1991
1992// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1993// to dupqlane(f64(C)) where C is A concatenated with B
1994static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1995 IntrinsicInst &II) {
1996 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1997 if (!match(II.getOperand(0),
1998 m_Intrinsic<Intrinsic::vector_insert>(
1999 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2000 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2001 return std::nullopt;
2002 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2003
2004 // Insert the scalars into a container ordered by InsertElement index
2005 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2006 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2007 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2008 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2009 CurrentInsertElt = InsertElt->getOperand(0);
2010 }
2011
2012 bool AllowPoison =
2013 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2014 if (!SimplifyValuePattern(Elts, AllowPoison))
2015 return std::nullopt;
2016
2017 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2018 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2019 for (size_t I = 0; I < Elts.size(); I++) {
2020 if (Elts[I] == nullptr)
2021 continue;
2022 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2023 IC.Builder.getInt64(I));
2024 }
2025 if (InsertEltChain == nullptr)
2026 return std::nullopt;
2027
2028 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2029 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2030 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2031 // be narrowed back to the original type.
2032 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2033 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2034 IIScalableTy->getMinNumElements() /
2035 PatternWidth;
2036
2037 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2038 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2039 auto *WideShuffleMaskTy =
2040 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2041
2042 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
2043 auto InsertSubvector = IC.Builder.CreateInsertVector(
2044 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
2045 auto WideBitcast =
2046 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2047 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2048 auto WideShuffle = IC.Builder.CreateShuffleVector(
2049 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2050 auto NarrowBitcast =
2051 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2052
2053 return IC.replaceInstUsesWith(II, NarrowBitcast);
2054}
2055
2056static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2057 IntrinsicInst &II) {
2058 Value *A = II.getArgOperand(0);
2059 Value *B = II.getArgOperand(1);
2060 if (A == B)
2061 return IC.replaceInstUsesWith(II, A);
2062
2063 return std::nullopt;
2064}
2065
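// Example (schematic): when the shifted value comes from a predicated abs (so
// it is known non-negative) and the shift amount is non-negative, e.g.
//   sve.srshl(%pg, sve.abs(%m, %pg, %x), splat(2))
// no rounding can occur and the cheaper sve.lsl can be used instead.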
2066static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2067 IntrinsicInst &II) {
2068 Value *Pred = II.getOperand(0);
2069 Value *Vec = II.getOperand(1);
2070 Value *Shift = II.getOperand(2);
2071
2072 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2073 Value *AbsPred, *MergedValue;
2074 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2075 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2076 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2077 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2078
2079 return std::nullopt;
2080
2081 // Transform is valid if any of the following are true:
2082 // * The ABS merge value is an undef or non-negative
2083 // * The ABS predicate is all active
2084 // * The ABS predicate and the SRSHL predicates are the same
2085 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2086 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2087 return std::nullopt;
2088
2089 // Only valid when the shift amount is non-negative, otherwise the rounding
2090 // behaviour of SRSHL cannot be ignored.
2091 if (!match(Shift, m_NonNegative()))
2092 return std::nullopt;
2093
2094 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2095 {II.getType()}, {Pred, Vec, Shift});
2096
2097 return IC.replaceInstUsesWith(II, LSL);
2098}
2099
2100std::optional<Instruction *>
2101AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2102 IntrinsicInst &II) const {
2103 Intrinsic::ID IID = II.getIntrinsicID();
2104 switch (IID) {
2105 default:
2106 break;
2107
2108 case Intrinsic::aarch64_sve_st1_scatter:
2109 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2110 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2111 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2112 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2113 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2114 case Intrinsic::aarch64_sve_st1dq:
2115 case Intrinsic::aarch64_sve_st1q_scatter_index:
2116 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2117 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2118 case Intrinsic::aarch64_sve_st1wq:
2119 case Intrinsic::aarch64_sve_stnt1:
2120 case Intrinsic::aarch64_sve_stnt1_scatter:
2121 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2122 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2123 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2124 return instCombineSVENoActiveUnaryErase(IC, II, 1);
2125 case Intrinsic::aarch64_sve_st2:
2126 case Intrinsic::aarch64_sve_st2q:
2127 return instCombineSVENoActiveUnaryErase(IC, II, 2);
2128 case Intrinsic::aarch64_sve_st3:
2129 case Intrinsic::aarch64_sve_st3q:
2130 return instCombineSVENoActiveUnaryErase(IC, II, 3);
2131 case Intrinsic::aarch64_sve_st4:
2132 case Intrinsic::aarch64_sve_st4q:
2133 return instCombineSVENoActiveUnaryErase(IC, II, 4);
2134 case Intrinsic::aarch64_sve_ld1_gather:
2135 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2136 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2137 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2138 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2139 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2140 case Intrinsic::aarch64_sve_ld1q_gather_index:
2141 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2142 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2143 case Intrinsic::aarch64_sve_ld1ro:
2144 case Intrinsic::aarch64_sve_ld1rq:
2145 case Intrinsic::aarch64_sve_ld1udq:
2146 case Intrinsic::aarch64_sve_ld1uwq:
2147 case Intrinsic::aarch64_sve_ld2_sret:
2148 case Intrinsic::aarch64_sve_ld2q_sret:
2149 case Intrinsic::aarch64_sve_ld3_sret:
2150 case Intrinsic::aarch64_sve_ld3q_sret:
2151 case Intrinsic::aarch64_sve_ld4_sret:
2152 case Intrinsic::aarch64_sve_ld4q_sret:
2153 case Intrinsic::aarch64_sve_ldff1:
2154 case Intrinsic::aarch64_sve_ldff1_gather:
2155 case Intrinsic::aarch64_sve_ldff1_gather_index:
2156 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2157 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2158 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2159 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2160 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2161 case Intrinsic::aarch64_sve_ldnf1:
2162 case Intrinsic::aarch64_sve_ldnt1:
2163 case Intrinsic::aarch64_sve_ldnt1_gather:
2164 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2165 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2166 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2167 return instCombineSVENoActiveUnaryZero(IC, II);
2168 case Intrinsic::aarch64_sve_prf:
2169 case Intrinsic::aarch64_sve_prfb_gather_index:
2170 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
2171 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
2172 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
2173 case Intrinsic::aarch64_sve_prfd_gather_index:
2174 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
2175 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
2176 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
2177 case Intrinsic::aarch64_sve_prfh_gather_index:
2178 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
2179 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
2180 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
2181 case Intrinsic::aarch64_sve_prfw_gather_index:
2182 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
2183 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
2184 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
2185 return instCombineSVENoActiveUnaryErase(IC, II, 0);
2186 case Intrinsic::aarch64_neon_fmaxnm:
2187 case Intrinsic::aarch64_neon_fminnm:
2188 return instCombineMaxMinNM(IC, II);
2189 case Intrinsic::aarch64_sve_convert_from_svbool:
2190 return instCombineConvertFromSVBool(IC, II);
2191 case Intrinsic::aarch64_sve_dup:
2192 return instCombineSVEDup(IC, II);
2193 case Intrinsic::aarch64_sve_dup_x:
2194 return instCombineSVEDupX(IC, II);
2195 case Intrinsic::aarch64_sve_cmpne:
2196 case Intrinsic::aarch64_sve_cmpne_wide:
2197 return instCombineSVECmpNE(IC, II);
2198 case Intrinsic::aarch64_sve_rdffr:
2199 return instCombineRDFFR(IC, II);
2200 case Intrinsic::aarch64_sve_lasta:
2201 case Intrinsic::aarch64_sve_lastb:
2202 return instCombineSVELast(IC, II);
2203 case Intrinsic::aarch64_sve_clasta_n:
2204 case Intrinsic::aarch64_sve_clastb_n:
2205 return instCombineSVECondLast(IC, II);
2206 case Intrinsic::aarch64_sve_cntd:
2207 return instCombineSVECntElts(IC, II, 2);
2208 case Intrinsic::aarch64_sve_cntw:
2209 return instCombineSVECntElts(IC, II, 4);
2210 case Intrinsic::aarch64_sve_cnth:
2211 return instCombineSVECntElts(IC, II, 8);
2212 case Intrinsic::aarch64_sve_cntb:
2213 return instCombineSVECntElts(IC, II, 16);
2214 case Intrinsic::aarch64_sve_ptest_any:
2215 case Intrinsic::aarch64_sve_ptest_first:
2216 case Intrinsic::aarch64_sve_ptest_last:
2217 return instCombineSVEPTest(IC, II);
2218 case Intrinsic::aarch64_sve_fabd:
2219 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2220 case Intrinsic::aarch64_sve_fadd:
2221 return instCombineSVEVectorFAdd(IC, II);
2222 case Intrinsic::aarch64_sve_fadd_u:
2223 return instCombineSVEVectorFAddU(IC, II);
2224 case Intrinsic::aarch64_sve_fdiv:
2225 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2226 case Intrinsic::aarch64_sve_fmax:
2227 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2228 case Intrinsic::aarch64_sve_fmaxnm:
2229 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2230 case Intrinsic::aarch64_sve_fmin:
2231 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2232 case Intrinsic::aarch64_sve_fminnm:
2233 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2234 case Intrinsic::aarch64_sve_fmla:
2235 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2236 case Intrinsic::aarch64_sve_fmls:
2237 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2238 case Intrinsic::aarch64_sve_fmul:
2239 if (auto II_U =
2240 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2241 return II_U;
2242 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2243 case Intrinsic::aarch64_sve_fmul_u:
2244 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2245 case Intrinsic::aarch64_sve_fmulx:
2246 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2247 case Intrinsic::aarch64_sve_fnmla:
2248 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2249 case Intrinsic::aarch64_sve_fnmls:
2250 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2251 case Intrinsic::aarch64_sve_fsub:
2252 return instCombineSVEVectorFSub(IC, II);
2253 case Intrinsic::aarch64_sve_fsub_u:
2254 return instCombineSVEVectorFSubU(IC, II);
2255 case Intrinsic::aarch64_sve_add:
2256 return instCombineSVEVectorAdd(IC, II);
2257 case Intrinsic::aarch64_sve_add_u:
2258 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2259 Intrinsic::aarch64_sve_mla_u>(
2260 IC, II, true);
2261 case Intrinsic::aarch64_sve_mla:
2262 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2263 case Intrinsic::aarch64_sve_mls:
2264 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2265 case Intrinsic::aarch64_sve_mul:
2266 if (auto II_U =
2267 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2268 return II_U;
2269 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2270 case Intrinsic::aarch64_sve_mul_u:
2271 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2272 case Intrinsic::aarch64_sve_sabd:
2273 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2274 case Intrinsic::aarch64_sve_smax:
2275 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2276 case Intrinsic::aarch64_sve_smin:
2277 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2278 case Intrinsic::aarch64_sve_smulh:
2279 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2280 case Intrinsic::aarch64_sve_sub:
2281 return instCombineSVEVectorSub(IC, II);
2282 case Intrinsic::aarch64_sve_sub_u:
2283 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2284 Intrinsic::aarch64_sve_mls_u>(
2285 IC, II, true);
2286 case Intrinsic::aarch64_sve_uabd:
2287 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2288 case Intrinsic::aarch64_sve_umax:
2289 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2290 case Intrinsic::aarch64_sve_umin:
2291 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2292 case Intrinsic::aarch64_sve_umulh:
2293 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2294 case Intrinsic::aarch64_sve_asr:
2295 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2296 case Intrinsic::aarch64_sve_lsl:
2297 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2298 case Intrinsic::aarch64_sve_lsr:
2299 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2300 case Intrinsic::aarch64_sve_and:
2301 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2302 case Intrinsic::aarch64_sve_bic:
2303 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2304 case Intrinsic::aarch64_sve_eor:
2305 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2306 case Intrinsic::aarch64_sve_orr:
2307 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2308 case Intrinsic::aarch64_sve_sqsub:
2309 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2310 case Intrinsic::aarch64_sve_uqsub:
2311 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2312 case Intrinsic::aarch64_sve_tbl:
2313 return instCombineSVETBL(IC, II);
2314 case Intrinsic::aarch64_sve_uunpkhi:
2315 case Intrinsic::aarch64_sve_uunpklo:
2316 case Intrinsic::aarch64_sve_sunpkhi:
2317 case Intrinsic::aarch64_sve_sunpklo:
2318 return instCombineSVEUnpack(IC, II);
2319 case Intrinsic::aarch64_sve_uzp1:
2320 return instCombineSVEUzp1(IC, II);
2321 case Intrinsic::aarch64_sve_zip1:
2322 case Intrinsic::aarch64_sve_zip2:
2323 return instCombineSVEZip(IC, II);
2324 case Intrinsic::aarch64_sve_ld1_gather_index:
2325 return instCombineLD1GatherIndex(IC, II);
2326 case Intrinsic::aarch64_sve_st1_scatter_index:
2327 return instCombineST1ScatterIndex(IC, II);
2328 case Intrinsic::aarch64_sve_ld1:
2329 return instCombineSVELD1(IC, II, DL);
2330 case Intrinsic::aarch64_sve_st1:
2331 return instCombineSVEST1(IC, II, DL);
2332 case Intrinsic::aarch64_sve_sdiv:
2333 return instCombineSVESDIV(IC, II);
2334 case Intrinsic::aarch64_sve_sel:
2335 return instCombineSVESel(IC, II);
2336 case Intrinsic::aarch64_sve_srshl:
2337 return instCombineSVESrshl(IC, II);
2338 case Intrinsic::aarch64_sve_dupq_lane:
2339 return instCombineSVEDupqLane(IC, II);
2340 }
2341
2342 return std::nullopt;
2343}
2344
2345std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2346 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2347 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2348 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2349 SimplifyAndSetOp) const {
2350 switch (II.getIntrinsicID()) {
2351 default:
2352 break;
2353 case Intrinsic::aarch64_neon_fcvtxn:
2354 case Intrinsic::aarch64_neon_rshrn:
2355 case Intrinsic::aarch64_neon_sqrshrn:
2356 case Intrinsic::aarch64_neon_sqrshrun:
2357 case Intrinsic::aarch64_neon_sqshrn:
2358 case Intrinsic::aarch64_neon_sqshrun:
2359 case Intrinsic::aarch64_neon_sqxtn:
2360 case Intrinsic::aarch64_neon_sqxtun:
2361 case Intrinsic::aarch64_neon_uqrshrn:
2362 case Intrinsic::aarch64_neon_uqshrn:
2363 case Intrinsic::aarch64_neon_uqxtn:
2364 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2365 break;
2366 }
2367
2368 return std::nullopt;
2369}
2370
2371bool AArch64TTIImpl::enableScalableVectorization() const {
2372 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2373 EnableScalableAutovecInStreamingMode);
2374}
2375
2376TypeSize
2377AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2378 switch (K) {
2379 case TargetTransformInfo::RGK_Scalar:
2380 return TypeSize::getFixed(64);
2381 case TargetTransformInfo::RGK_FixedWidthVector:
2382 if (ST->useSVEForFixedLengthVectors() &&
2383 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2384 return TypeSize::getFixed(
2385 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2386 else if (ST->isNeonAvailable())
2387 return TypeSize::getFixed(128);
2388 else
2389 return TypeSize::getFixed(0);
2390 case TargetTransformInfo::RGK_ScalableVector:
2391 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2392 EnableScalableAutovecInStreamingMode))
2393 return TypeSize::getScalable(128);
2394 else
2395 return TypeSize::getScalable(0);
2396 }
2397 llvm_unreachable("Unsupported register kind");
2398}
2399
2400bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2401 ArrayRef<const Value *> Args,
2402 Type *SrcOverrideTy) {
2403 // A helper that returns a vector type from the given type. The number of
2404 // elements in type Ty determines the vector width.
2405 auto toVectorTy = [&](Type *ArgTy) {
2406 return VectorType::get(ArgTy->getScalarType(),
2407 cast<VectorType>(DstTy)->getElementCount());
2408 };
2409
2410 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2411 // i32, i64]. SVE doesn't generally have the same set of instructions to
2412 // perform an extend with the add/sub/mul. There are SMULLB style
2413 // instructions, but they operate on top/bottom, requiring some sort of lane
2414 // interleaving to be used with zext/sext.
2415 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2416 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2417 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2418 return false;
2419
2420 // Determine if the operation has a widening variant. We consider both the
2421 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2422 // instructions.
2423 //
2424 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2425 // verify that their extending operands are eliminated during code
2426 // generation.
2427 Type *SrcTy = SrcOverrideTy;
2428 switch (Opcode) {
2429 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2430 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2431 // The second operand needs to be an extend
2432 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2433 if (!SrcTy)
2434 SrcTy =
2435 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2436 } else
2437 return false;
2438 break;
2439 case Instruction::Mul: { // SMULL(2), UMULL(2)
2440 // Both operands need to be extends of the same type.
2441 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2442 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2443 if (!SrcTy)
2444 SrcTy =
2445 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2446 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2447 // If one of the operands is a Zext and the other has enough zero bits to
2448 // be treated as unsigned, we can still generate a umull, meaning the zext
2449 // is free.
2450 KnownBits Known =
2451 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2452 if (Args[0]->getType()->getScalarSizeInBits() -
2453 Known.Zero.countLeadingOnes() >
2454 DstTy->getScalarSizeInBits() / 2)
2455 return false;
2456 if (!SrcTy)
2457 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2458 DstTy->getScalarSizeInBits() / 2));
2459 } else
2460 return false;
2461 break;
2462 }
2463 default:
2464 return false;
2465 }
2466
2467 // Legalize the destination type and ensure it can be used in a widening
2468 // operation.
2469 auto DstTyL = getTypeLegalizationCost(DstTy);
2470 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2471 return false;
2472
2473 // Legalize the source type and ensure it can be used in a widening
2474 // operation.
2475 assert(SrcTy && "Expected some SrcTy");
2476 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2477 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2478 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2479 return false;
2480
2481 // Get the total number of vector elements in the legalized types.
2482 InstructionCost NumDstEls =
2483 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2484 InstructionCost NumSrcEls =
2485 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2486
2487 // Return true if the legalized types have the same number of vector elements
2488 // and the destination element type size is twice that of the source type.
2489 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2490}
2491
2492// s/urhadd instructions implement the following pattern, making the
2493// extends free:
2494// %x = add ((zext i8 -> i16), 1)
2495// %y = (zext i8 -> i16)
2496// trunc i16 (lshr (add %x, %y), 1) -> i8
2497//
2498static bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2499 Type *Src) {
2500 // The source should be a legal vector type.
2501 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2502 (Src->isScalableTy() && !ST->hasSVE2()))
2503 return false;
2504
2505 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2506 return false;
2507
2508 // Look for trunc/shl/add before trying to match the pattern.
2509 const Instruction *Add = ExtUser;
2510 auto *AddUser =
2511 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2512 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2513 Add = AddUser;
2514
2515 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2516 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2517 return false;
2518
2519 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2520 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2521 Src->getScalarSizeInBits() !=
2522 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2523 return false;
2524
2525 // Try to match the whole pattern. Ext could be either the first or second
2526 // m_ZExtOrSExt matched.
2527 Instruction *Ex1, *Ex2;
2528 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2529 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2530 return false;
2531
2532 // Ensure both extends are of the same type
2533 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2534 Ex1->getOpcode() == Ex2->getOpcode())
2535 return true;
2536
2537 return false;
2538}
2539
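// Example: per the conversion table below, a trunc of <4 x i64> to <4 x i16>
// is costed at 2 (one uzp1 plus one xtn), whereas a legal single-instruction
// conversion such as sitofp <4 x i32> to <4 x float> is costed at 1.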
2540InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2541 Type *Src,
2542 TTI::CastContextHint CCH,
2543 TTI::TargetCostKind CostKind,
2544 const Instruction *I) {
2545 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2546 assert(ISD && "Invalid opcode");
2547 // If the cast is observable, and it is used by a widening instruction (e.g.,
2548 // uaddl, saddw, etc.), it may be free.
2549 if (I && I->hasOneUser()) {
2550 auto *SingleUser = cast<Instruction>(*I->user_begin());
2551 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2552 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2553 // For adds, only count the second operand as free if both operands are
2554 // extends but not the same operation (i.e. both operands are not free in
2555 // add(sext, zext)).
2556 if (SingleUser->getOpcode() == Instruction::Add) {
2557 if (I == SingleUser->getOperand(1) ||
2558 (isa<CastInst>(SingleUser->getOperand(1)) &&
2559 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2560 return 0;
2561 } else // Others are free so long as isWideningInstruction returned true.
2562 return 0;
2563 }
2564
2565 // The cast will be free for the s/urhadd instructions
2566 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2567 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2568 return 0;
2569 }
2570
2571 // TODO: Allow non-throughput costs that aren't binary.
2572 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2573 if (CostKind != TTI::TCK_RecipThroughput)
2574 return Cost == 0 ? 0 : 1;
2575 return Cost;
2576 };
2577
2578 EVT SrcTy = TLI->getValueType(DL, Src);
2579 EVT DstTy = TLI->getValueType(DL, Dst);
2580
2581 if (!SrcTy.isSimple() || !DstTy.isSimple())
2582 return AdjustCost(
2583 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2584
2585 static const TypeConversionCostTblEntry
2586 ConversionTbl[] = {
2587 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2588 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2589 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2590 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2591 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2592 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2593 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2594 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2595 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2596 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2597 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2598 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2599 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2600 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2601 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2602 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2603 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2604 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2605 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2606 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2607
2608 // Truncations on nxvmiN
2609 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2610 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2611 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2612 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2613 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2614 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2615 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2616 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2617 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2618 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2619 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2620 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2621 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2622 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2623 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2624 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2625
2626 // The number of shll instructions for the extension.
2627 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2628 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2629 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2630 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2631 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2632 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2633 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2634 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2635 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2636 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2637 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2638 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2639 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2640 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2641 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2642 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2643
2644 // LowerVectorINT_TO_FP:
2645 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2646 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2647 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2648 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2649 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2650 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2651
2652 // Complex: to v2f32
2653 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2654 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2655 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2656 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2657 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2658 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2659
2660 // Complex: to v4f32
2661 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2662 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2663 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2664 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2665
2666 // Complex: to v8f32
2667 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2668 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2669 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2670 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2671
2672 // Complex: to v16f32
2673 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2674 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2675
2676 // Complex: to v2f64
2677 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2678 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2679 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2680 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2681 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2682 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2683
2684 // Complex: to v4f64
2685 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2686 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2687
2688 // LowerVectorFP_TO_INT
2689 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2690 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2691 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2692 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2693 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2694 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2695
2696 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2697 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2698 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2699 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2700 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2701 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2702 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2703
2704 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2705 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2706 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2707 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2708 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2709
2710 // Complex, from nxv2f32.
2711 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2712 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2713 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2714 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2715 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2716 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2717 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2718 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2719
2720 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2721 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2722 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2723 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2724 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2725 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2726 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2727
2728 // Complex, from nxv2f64.
2729 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2730 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2731 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2732 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2733 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2734 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2735 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2736 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2737
2738 // Complex, from nxv4f32.
2739 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2740 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2741 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2742 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2743 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2744 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2745 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2746 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2747
2748 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2749 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2750 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2751 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2752 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2753
2754 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2755 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2756 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2757 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2758 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2759 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2760 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2761
2762 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2763 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2764 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2765 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2766 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2767
2768 // Complex, from nxv8f16.
2769 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2770 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2771 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2772 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2773 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2774 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2775 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2776 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2777
2778 // Complex, from nxv4f16.
2779 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2780 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2781 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2782 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2783 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2784 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2785 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2786 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2787
2788 // Complex, from nxv2f16.
2789 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2790 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2791 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2792 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2793 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2794 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2795 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2796 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2797
2798 // Truncate from nxvmf32 to nxvmf16.
2799 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2800 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2801 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2802
2803 // Truncate from nxvmf64 to nxvmf16.
2804 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2805 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2806 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2807
2808 // Truncate from nxvmf64 to nxvmf32.
2809 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2810 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2811 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2812
2813 // Extend from nxvmf16 to nxvmf32.
2814 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2815 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2816 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2817
2818 // Extend from nxvmf16 to nxvmf64.
2819 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2820 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2821 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2822
2823 // Extend from nxvmf32 to nxvmf64.
2824 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2825 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2826 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2827
2828 // Bitcasts from float to integer
2829 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2830 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2831 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2832
2833 // Bitcasts from integer to float
2834 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2835 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2836 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2837
2838 // Add cost for extending to illegal (too wide) scalable vectors.
2839 // zero/sign extend are implemented by multiple unpack operations,
2840 // where each operation has a cost of 1.
2841 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2842 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2843 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2844 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2845 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2846 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2847
2848 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2849 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2850 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2851 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2852 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2853 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2854 };
2855
2856 // We have to estimate the cost of a fixed-length operation performed on
2857 // SVE registers by scaling with the number of registers required to
2858 // represent the fixed-length type on SVE registers.
2859 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2860 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2861 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2862 ST->useSVEForFixedLengthVectors(WiderTy)) {
2863 std::pair<InstructionCost, MVT> LT =
2864 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2865 unsigned NumElements = AArch64::SVEBitsPerBlock /
2866 LT.second.getScalarSizeInBits();
2867 return AdjustCost(
2868 LT.first *
2869 getCastInstrCost(
2870 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2871 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2872 CostKind, I));
2873 }
2874
2875 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2876 DstTy.getSimpleVT(),
2877 SrcTy.getSimpleVT()))
2878 return AdjustCost(Entry->Cost);
2879
2880 static const TypeConversionCostTblEntry FP16Tbl[] = {
2881 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2882 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2883 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2884 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2885 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2886 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2887 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2888 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2889 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2890 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2891 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2892 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2893 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2894 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2895 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2896 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2897 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2898 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2899 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2900 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2901 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2902 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2903 };
2904
2905 if (ST->hasFullFP16())
2906 if (const auto *Entry = ConvertCostTableLookup(
2907 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2908 return AdjustCost(Entry->Cost);
2909
2910 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2911 CCH == TTI::CastContextHint::Masked &&
2912 ST->isSVEorStreamingSVEAvailable() &&
2913 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2914 TargetLowering::TypePromoteInteger &&
2915 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2916 TargetLowering::TypeSplitVector) {
2917 // The standard behaviour in the backend for these cases is to split the
2918 // extend up into two parts:
2919 // 1. Perform an extending load or masked load up to the legal type.
2920 // 2. Extend the loaded data to the final type.
2921 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2922 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2923 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2924 Opcode, LegalTy, Src, CCH, CostKind, I);
2925 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2926 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2927 return Part1 + Part2;
2928 }
2929
2930 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2931 // but we also want to include the TTI::CastContextHint::Masked case too.
2932 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2933 CCH == TTI::CastContextHint::Masked &&
2934 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
2935 CCH = TTI::CastContextHint::Normal;
2936
2937 return AdjustCost(
2938 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2939}
2940
2941InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2942 Type *Dst,
2943 VectorType *VecTy,
2944 unsigned Index) {
2945
2946 // Make sure we were given a valid extend opcode.
2947 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2948 "Invalid opcode");
2949
2950 // We are extending an element we extract from a vector, so the source type
2951 // of the extend is the element type of the vector.
2952 auto *Src = VecTy->getElementType();
2953
2954 // Sign- and zero-extends are for integer types only.
2955 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2956
2957 // Get the cost for the extract. We compute the cost (if any) for the extend
2958 // below.
2959 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2960 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2961 CostKind, Index, nullptr, nullptr);
2962
2963 // Legalize the types.
2964 auto VecLT = getTypeLegalizationCost(VecTy);
2965 auto DstVT = TLI->getValueType(DL, Dst);
2966 auto SrcVT = TLI->getValueType(DL, Src);
2967
2968 // If the resulting type is still a vector and the destination type is legal,
2969 // we may get the extension for free. If not, get the default cost for the
2970 // extend.
2971 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2972 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2973 CostKind);
2974
2975 // The destination type should be larger than the element type. If not, get
2976 // the default cost for the extend.
2977 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2978 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2979 CostKind);
2980
2981 switch (Opcode) {
2982 default:
2983 llvm_unreachable("Opcode should be either SExt or ZExt");
2984
2985 // For sign-extends, we only need a smov, which performs the extension
2986 // automatically.
2987 case Instruction::SExt:
2988 return Cost;
2989
2990 // For zero-extends, the extend is performed automatically by a umov unless
2991 // the destination type is i64 and the element type is i8 or i16.
2992 case Instruction::ZExt:
2993 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2994 return Cost;
2995 }
2996
2997 // If we are unable to perform the extend for free, get the default cost.
2998 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2999 CostKind);
3000}
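// Worked example for the model above (illustrative): sign-extending an
// extracted v8i16 lane to i32 is costed as just the extract, since an smov
// such as "smov w0, v0.h[1]" performs the extension itself. Zero-extending
// the same lane to i64 falls outside the free cases, so the extract cost
// plus a separate extend cost is returned.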
3001
3004 const Instruction *I) {
3005 if (CostKind != TTI::TCK_RecipThroughput)
3006 return Opcode == Instruction::PHI ? 0 : 1;
3007 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3008 // Branches are assumed to be predicted.
3009 return 0;
3010}
3011
3012InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
3013 Type *Val,
3014 unsigned Index,
3015 bool HasRealUse) {
3016 assert(Val->isVectorTy() && "This must be a vector type");
3017
3018 if (Index != -1U) {
3019 // Legalize the type.
3020 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
3021
3022 // This type is legalized to a scalar type.
3023 if (!LT.second.isVector())
3024 return 0;
3025
3026 // The type may be split. For fixed-width vectors we can normalize the
3027 // index to the new type.
3028 if (LT.second.isFixedLengthVector()) {
3029 unsigned Width = LT.second.getVectorNumElements();
3030 Index = Index % Width;
3031 }
3032
3033 // The element at index zero is already inside the vector.
3034 // - For a physical (HasRealUse==true) insert-element or extract-element
3035 // instruction that extracts integers, an explicit FPR -> GPR move is
3036 // needed. So it has non-zero cost.
3037 // - For the rest of cases (virtual instruction or element type is float),
3038 // consider the instruction free.
3039 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3040 return 0;
3041
3042 // This is recognising an LD1 "single structure to one lane of one
3043 // register" load. I.e., if this is an `insertelement` instruction,
3044 // and its second operand is a load, then we will generate an LD1,
3045 // which is an expensive instruction.
3046 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
3047 return ST->getVectorInsertExtractBaseCost() + 1;
3048
3049 // i1 inserts and extracts will include an extra cset or cmp of the vector
3050 // value. Increase the cost by 1 to account for this.
3051 if (Val->getScalarSizeInBits() == 1)
3052 return ST->getVectorInsertExtractBaseCost() + 1;
3053
3054 // FIXME:
3055 // If the extract-element and insert-element instructions could be
3056 // simplified away (e.g., could be combined into users by looking at use-def
3057 // context), they have no cost. This is not done in the first place for
3058 // compile-time considerations.
3059 }
3060
3061 // All other insert/extracts cost this much.
3062 return ST->getVectorInsertExtractBaseCost();
3063}
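// Illustrative examples of the rules above (sketches, not exhaustive):
//   extractelement <4 x float> %v, i64 0       -> 0 (lane 0, FP element)
//   extractelement <4 x i32> %v, i64 0         -> base cost when it has a
//                                                 real use (FPR -> GPR move)
//   %l = load i32, ptr %p
//   insertelement <4 x i32> %v, i32 %l, i64 1  -> base cost + 1 (becomes a
//                                                 lane-indexed LD1)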
3064
3067 unsigned Index, Value *Op0,
3068 Value *Op1) {
3069 bool HasRealUse =
3070 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3071 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
3072}
3073
3075 Type *Val,
3077 unsigned Index) {
3078 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
3079}
3080
3082 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3084 if (isa<ScalableVectorType>(Ty))
3086 if (Ty->getElementType()->isFloatingPointTy())
3087 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3088 CostKind);
3089 return DemandedElts.popcount() * (Insert + Extract) *
3090 ST->getVectorInsertExtractBaseCost();
3091}
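// For example (illustrative): scalarizing both the inserts and extracts of 4
// demanded integer lanes costs 4 * (1 + 1) * the base insert/extract cost.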
3092
3094 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3097 const Instruction *CxtI) {
3098
3099 // The code-generator is currently not able to handle scalable vectors
3100 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3101 // it. This change will be removed when code-generation for these types is
3102 // sufficiently reliable.
3103 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3104 if (VTy->getElementCount() == ElementCount::getScalable(1))
3106
3107 // TODO: Handle more cost kinds.
3109 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3110 Op2Info, Args, CxtI);
3111
3112 // Legalize the type.
3113 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3114 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3115
3116 switch (ISD) {
3117 default:
3118 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3119 Op2Info);
3120 case ISD::SDIV:
3121 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3122 // On AArch64, scalar signed division by a power-of-two constant is
3123 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3124 // The OperandValue properties may not be the same as those of the
3125 // previous operation; conservatively assume OP_None.
3127 Instruction::Add, Ty, CostKind,
3128 Op1Info.getNoProps(), Op2Info.getNoProps());
3129 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3130 Op1Info.getNoProps(), Op2Info.getNoProps());
3132 Instruction::Select, Ty, CostKind,
3133 Op1Info.getNoProps(), Op2Info.getNoProps());
3134 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3135 Op1Info.getNoProps(), Op2Info.getNoProps());
3136 return Cost;
3137 }
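// E.g. for "sdiv i32 %x, 4" (an assumed example) the expansion is modelled
// as the sum of the add, sub (standing in for the compare), select and ashr
// costs computed above, typically 4 in total for a scalar i32.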
3138 [[fallthrough]];
3139 case ISD::UDIV: {
3140 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3141 auto VT = TLI->getValueType(DL, Ty);
3142 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3143 // Vector signed division by a constant is expanded to the
3144 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3145 // to MULHU + SUB + SRL + ADD + SRL.
3147 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3149 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3151 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3152 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3153 }
3154 }
3155
3157 Opcode, Ty, CostKind, Op1Info, Op2Info);
3158 if (Ty->isVectorTy()) {
3159 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3160 // If SDIV/UDIV operations are lowered using SVE, then the cost is
3161 // lower.
3162 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3163 ->getPrimitiveSizeInBits()
3164 .getFixedValue() < 128) {
3165 EVT VT = TLI->getValueType(DL, Ty);
3166 static const CostTblEntry DivTbl[]{
3167 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3168 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3169 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3170 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3171 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3172 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3173
3174 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3175 if (nullptr != Entry)
3176 return Entry->Cost;
3177 }
3178 // For 8/16-bit elements, the cost is higher because the type
3179 // requires promotion and possibly splitting:
3180 if (LT.second.getScalarType() == MVT::i8)
3181 Cost *= 8;
3182 else if (LT.second.getScalarType() == MVT::i16)
3183 Cost *= 4;
3184 return Cost;
3185 } else {
3186 // If one of the operands is a uniform constant then the per-element
3187 // cost is the cost of insertion, extraction and division:
3188 // insertion cost = 2, extraction cost = 2, division = the cost of the
3189 // operation on the scalar type.
3190 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3191 (Op2Info.isConstant() && Op2Info.isUniform())) {
3192 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3194 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3195 return (4 + DivCost) * VTy->getNumElements();
3196 }
3197 }
3198 // On AArch64, without SVE, vector divisions are expanded
3199 // into scalar divisions of each pair of elements.
3200 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3201 CostKind, Op1Info, Op2Info);
3202 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3203 Op1Info, Op2Info);
3204 }
3205
3206 // TODO: if one of the arguments is scalar, then it's not necessary to
3207 // double the cost of handling the vector elements.
3208 Cost += Cost;
3209 }
3210 return Cost;
3211 }
3212 case ISD::MUL:
3213 // When SVE is available, then we can lower the v2i64 operation using
3214 // the SVE mul instruction, which has a lower cost.
3215 if (LT.second == MVT::v2i64 && ST->hasSVE())
3216 return LT.first;
3217
3218 // When SVE is not available, there is no MUL.2d instruction,
3219 // which means mul <2 x i64> is expensive as elements are extracted
3220 // from the vectors and the muls scalarized.
3221 // As getScalarizationOverhead is a bit too pessimistic, we
3222 // estimate the cost for a i64 vector directly here, which is:
3223 // - four 2-cost i64 extracts,
3224 // - two 2-cost i64 inserts, and
3225 // - two 1-cost muls.
3226 // So, for a v2i64 with LT.first = 1 the cost is 4*2 + 2*2 + 2*1 = 14, and
3227 // for a v4i64 with LT.first = 2 the cost is 28. If both operands are
3228 // extensions it will not need to scalarize, so the cost can be cheaper
3229 // (smull or umull).
3230 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3231 return LT.first;
3232 return LT.first * 14;
3233 case ISD::ADD:
3234 case ISD::XOR:
3235 case ISD::OR:
3236 case ISD::AND:
3237 case ISD::SRL:
3238 case ISD::SRA:
3239 case ISD::SHL:
3240 // These nodes are marked as 'custom' for combining purposes only.
3241 // We know that they are legal. See LowerAdd in ISelLowering.
3242 return LT.first;
3243
3244 case ISD::FNEG:
3245 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
3246 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
3247 (Ty->isHalfTy() && ST->hasFullFP16())) &&
3248 CxtI &&
3249 ((CxtI->hasOneUse() &&
3250 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
3251 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
3252 return 0;
3253 [[fallthrough]];
3254 case ISD::FADD:
3255 case ISD::FSUB:
3256 // Increase the cost for half and bfloat types if not architecturally
3257 // supported.
3258 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3259 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3260 return 2 * LT.first;
3261 if (!Ty->getScalarType()->isFP128Ty())
3262 return LT.first;
3263 [[fallthrough]];
3264 case ISD::FMUL:
3265 case ISD::FDIV:
3266 // These nodes are marked as 'custom' just to lower them to SVE.
3267 // We know said lowering will incur no additional cost.
3268 if (!Ty->getScalarType()->isFP128Ty())
3269 return 2 * LT.first;
3270
3271 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3272 Op2Info);
3273 case ISD::FREM:
3274 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3275 // those functions are not declared in the module.
3276 if (!Ty->isVectorTy())
3277 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3278 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3279 Op2Info);
3280 }
3281}
3282
3284 ScalarEvolution *SE,
3285 const SCEV *Ptr) {
3286 // Address computations in vectorized code with non-consecutive addresses will
3287 // likely result in more instructions compared to scalar code where the
3288 // computation can more often be merged into the index mode. The resulting
3289 // extra micro-ops can significantly decrease throughput.
3290 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3291 int MaxMergeDistance = 64;
3292
3293 if (Ty->isVectorTy() && SE &&
3294 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3295 return NumVectorInstToHideOverhead;
3296
3297 // In many cases the address computation is not merged into the instruction
3298 // addressing mode.
3299 return 1;
3300}
3301
3303 Type *CondTy,
3304 CmpInst::Predicate VecPred,
3306 const Instruction *I) {
3307 // TODO: Handle other cost kinds.
3309 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3310 I);
3311
3312 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3313 // Vector selects that are wider than the register width are not lowered
3314 // well.
3315 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3316 // We would need this many instructions to hide the scalarization happening.
3317 const int AmortizationCost = 20;
3318
3319 // If VecPred is not set, check if we can get a predicate from the context
3320 // instruction, if its type matches the requested ValTy.
3321 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3322 CmpInst::Predicate CurrentPred;
3323 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3324 m_Value())))
3325 VecPred = CurrentPred;
3326 }
3327 // Check if we have a compare/select chain that can be lowered using
3328 // a (F)CMxx & BFI pair.
3329 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3330 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3331 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3332 VecPred == CmpInst::FCMP_UNE) {
3333 static const auto ValidMinMaxTys = {
3334 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3335 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3336 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3337
3338 auto LT = getTypeLegalizationCost(ValTy);
3339 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3340 (ST->hasFullFP16() &&
3341 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3342 return LT.first;
3343 }
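// Illustrative lowering assumed by the check above: a select fed by a
// supported compare, e.g.
//   %c = fcmp olt <4 x float> %a, %b
//   %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
// can be emitted as a single compare plus a bit-select (or as fmin/fmax for
// min/max patterns), hence the LT.first cost.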
3344
3345 static const TypeConversionCostTblEntry
3346 VectorSelectTbl[] = {
3347 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3348 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3349 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3350 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3351 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3352 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3353 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3354 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3355 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3356 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3357 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3358 };
3359
3360 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3361 EVT SelValTy = TLI->getValueType(DL, ValTy);
3362 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3363 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3364 SelCondTy.getSimpleVT(),
3365 SelValTy.getSimpleVT()))
3366 return Entry->Cost;
3367 }
3368 }
3369
3370 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3371 auto LT = getTypeLegalizationCost(ValTy);
3372 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3373 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3374 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3375 }
3376
3377 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3378 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3379 // be profitable.
3380 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3381 ICmpInst::isEquality(VecPred) &&
3382 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3383 match(I->getOperand(1), m_Zero()) &&
3384 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3385 return 0;
3386
3387 // The base case handles scalable vectors fine for now, since it treats the
3388 // cost as 1 * legalization cost.
3389 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3390}
3391
3393AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3395 if (ST->requiresStrictAlign()) {
3396 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3397 // a bunch of instructions when strict align is enabled.
3398 return Options;
3399 }
3400 Options.AllowOverlappingLoads = true;
3401 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3402 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3403 // TODO: Though vector loads usually perform well on AArch64, on some
3404 // targets they may wake up the FP unit, which raises power consumption.
3405 // Perhaps they could be used with no holds barred (-O3).
3406 Options.LoadSizes = {8, 4, 2, 1};
3407 Options.AllowedTailExpansions = {3, 5, 6};
3408 return Options;
3409}
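// Sketch of the effect of these options (illustrative): with
// LoadSizes = {8, 4, 2, 1} and overlapping loads permitted, a 7-byte memcmp
// can be expanded into two overlapping 4-byte loads per buffer (bytes [0,4)
// and [3,7)) rather than a libcall; the exact sequence is chosen by the
// generic memcmp expansion pass.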
3410
3412 return ST->hasSVE();
3413}
3414
3417 Align Alignment, unsigned AddressSpace,
3419 if (useNeonVector(Src))
3420 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3421 CostKind);
3422 auto LT = getTypeLegalizationCost(Src);
3423 if (!LT.first.isValid())
3425
3426 // Return an invalid cost for element types that we are unable to lower.
3427 auto *VT = cast<VectorType>(Src);
3428 if (VT->getElementType()->isIntegerTy(1))
3430
3431 // The code-generator is currently not able to handle scalable vectors
3432 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3433 // it. This change will be removed when code-generation for these types is
3434 // sufficiently reliable.
3435 if (VT->getElementCount() == ElementCount::getScalable(1))
3437
3438 return LT.first;
3439}
3440
3441 // This function returns the gather/scatter overhead, either from the
3442 // user-provided value or from the per-target specialized value in \p ST.
3443static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
3444 const AArch64Subtarget *ST) {
3445 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
3446 "Should be called on only load or stores.");
3447 switch (Opcode) {
3448 case Instruction::Load:
3449 if (SVEGatherOverhead.getNumOccurrences() > 0)
3450 return SVEGatherOverhead;
3451 return ST->getGatherOverhead();
3452 break;
3453 case Instruction::Store:
3454 if (SVEScatterOverhead.getNumOccurrences() > 0)
3455 return SVEScatterOverhead;
3456 return ST->getScatterOverhead();
3457 break;
3458 default:
3459 llvm_unreachable("Shouldn't have reached here");
3460 }
3461}
3462
3464 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3465 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3466 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3467 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3468 Alignment, CostKind, I);
3469 auto *VT = cast<VectorType>(DataTy);
3470 auto LT = getTypeLegalizationCost(DataTy);
3471 if (!LT.first.isValid())
3473
3474 // Return an invalid cost for element types that we are unable to lower.
3475 if (!LT.second.isVector() ||
3476 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3477 VT->getElementType()->isIntegerTy(1))
3479
3480 // The code-generator is currently not able to handle scalable vectors
3481 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3482 // it. This change will be removed when code-generation for these types is
3483 // sufficiently reliable.
3484 if (VT->getElementCount() == ElementCount::getScalable(1))
3486
3487 ElementCount LegalVF = LT.second.getVectorElementCount();
3488 InstructionCost MemOpCost =
3489 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3490 {TTI::OK_AnyValue, TTI::OP_None}, I);
3491 // Add on an overhead cost for using gathers/scatters.
3492 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
3493 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3494}
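// Worked example (illustrative): for a gather load of <vscale x 4 x i32>,
// LT.first is 1 and the legal VF is nxv4i32, so the result is roughly
//   scalar i32 load cost * gather overhead * getMaxNumElements(nxv4),
// i.e. the per-element memory cost scaled by the SVE gather overhead and by
// the maximum number of lanes the scalable type may hold.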
3495
3497 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3498}
3499
3501 MaybeAlign Alignment,
3502 unsigned AddressSpace,
3504 TTI::OperandValueInfo OpInfo,
3505 const Instruction *I) {
3506 EVT VT = TLI->getValueType(DL, Ty, true);
3507 // Type legalization can't handle structs
3508 if (VT == MVT::Other)
3509 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3510 CostKind);
3511
3512 auto LT = getTypeLegalizationCost(Ty);
3513 if (!LT.first.isValid())
3515
3516 // The code-generator is currently not able to handle scalable vectors
3517 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3518 // it. This change will be removed when code-generation for these types is
3519 // sufficiently reliable.
3520 // We also only support full register predicate loads and stores.
3521 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3522 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3523 (VTy->getElementType()->isIntegerTy(1) &&
3524 !VTy->getElementCount().isKnownMultipleOf(
3527
3528 // TODO: consider latency as well for TCK_SizeAndLatency.
3530 return LT.first;
3531
3533 return 1;
3534
3535 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3536 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3537 // Unaligned stores are extremely inefficient. We don't split all
3538 // unaligned 128-bit stores because of the negative impact that has been
3539 // shown in practice on inlined block copy code.
3540 // We make such stores expensive so that we will only vectorize if there
3541 // are 6 other instructions getting vectorized.
3542 const int AmortizationCost = 6;
3543
3544 return LT.first * 2 * AmortizationCost;
3545 }
3546
3547 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3548 if (Ty->isPtrOrPtrVectorTy())
3549 return LT.first;
3550
3551 if (useNeonVector(Ty)) {
3552 // Check truncating stores and extending loads.
3553 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3554 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3555 if (VT == MVT::v4i8)
3556 return 2;
3557 // Otherwise we need to scalarize.
3558 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3559 }
3560 EVT EltVT = VT.getVectorElementType();
3561 unsigned EltSize = EltVT.getScalarSizeInBits();
3562 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3563 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3564 *Alignment != Align(1))
3565 return LT.first;
3566 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3567 // widening to v4i8, which produces suboptimal results.
3568 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3569 return LT.first;
3570
3571 // Check non-power-of-2 loads/stores for legal vector element types with
3572 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3573 // operations on smaller power-of-2 ops, including ld1/st1.
3574 LLVMContext &C = Ty->getContext();
3576 SmallVector<EVT> TypeWorklist;
3577 TypeWorklist.push_back(VT);
3578 while (!TypeWorklist.empty()) {
3579 EVT CurrVT = TypeWorklist.pop_back_val();
3580 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3581 if (isPowerOf2_32(CurrNumElements)) {
3582 Cost += 1;
3583 continue;
3584 }
3585
3586 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3587 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3588 TypeWorklist.push_back(
3589 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3590 }
3591 return Cost;
3592 }
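// Worked example of the power-of-2 breakdown above (illustrative): a
// 1-byte-aligned load of <7 x i8> is split as 7 -> 4 + 3 -> 4 + 2 + 1,
// giving a cost of 3 (one memory operation per power-of-2 piece).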
3593
3594 return LT.first;
3595}
3596
3598 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3599 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3600 bool UseMaskForCond, bool UseMaskForGaps) {
3601 assert(Factor >= 2 && "Invalid interleave factor");
3602 auto *VecVTy = cast<VectorType>(VecTy);
3603
3604 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3606
3607 // Vectorization for masked interleaved accesses is only enabled for scalable
3608 // VF.
3609 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3611
3612 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3613 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3614 auto *SubVecTy =
3615 VectorType::get(VecVTy->getElementType(),
3616 VecVTy->getElementCount().divideCoefficientBy(Factor));
3617
3618 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3619 // Accesses having vector types that are a multiple of 128 bits can be
3620 // matched to more than one ldN/stN instruction.
3621 bool UseScalable;
3622 if (MinElts % Factor == 0 &&
3623 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3624 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3625 }
3626
3627 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3628 Alignment, AddressSpace, CostKind,
3629 UseMaskForCond, UseMaskForGaps);
3630}
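// Worked example (illustrative): an interleaved group with Factor = 2 over
// <8 x i32> uses SubVecTy = <4 x i32>, a legal 128-bit type needing a single
// ld2/st2, so the returned cost is 2 * 1 = 2; wider types scale with the
// number of ldN/stN instructions required.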
3631
3636 for (auto *I : Tys) {
3637 if (!I->isVectorTy())
3638 continue;
3639 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3640 128)
3641 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3642 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3643 }
3644 return Cost;
3645}
3646
3648 return ST->getMaxInterleaveFactor();
3649}
3650
3651// For Falkor, we want to avoid having too many strided loads in a loop since
3652// that can exhaust the HW prefetcher resources. We adjust the unroller
3653// MaxCount preference below to attempt to ensure unrolling doesn't create too
3654// many strided loads.
3655static void
3658 enum { MaxStridedLoads = 7 };
3659 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3660 int StridedLoads = 0;
3661 // FIXME? We could make this more precise by looking at the CFG and
3662 // e.g. not counting loads in each side of an if-then-else diamond.
3663 for (const auto BB : L->blocks()) {
3664 for (auto &I : *BB) {
3665 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3666 if (!LMemI)
3667 continue;
3668
3669 Value *PtrValue = LMemI->getPointerOperand();
3670 if (L->isLoopInvariant(PtrValue))
3671 continue;
3672
3673 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3674 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3675 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3676 continue;
3677
3678 // FIXME? We could take pairing of unrolled load copies into account
3679 // by looking at the AddRec, but we would probably have to limit this
3680 // to loops with no stores or other memory optimization barriers.
3681 ++StridedLoads;
3682 // We've seen enough strided loads that seeing more won't make a
3683 // difference.
3684 if (StridedLoads > MaxStridedLoads / 2)
3685 return StridedLoads;
3686 }
3687 }
3688 return StridedLoads;
3689 };
3690
3691 int StridedLoads = countStridedLoads(L, SE);
3692 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3693 << " strided loads\n");
3694 // Pick the largest power of 2 unroll count that won't result in too many
3695 // strided loads.
3696 if (StridedLoads) {
3697 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3698 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3699 << UP.MaxCount << '\n');
3700 }
3701}
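// For instance (illustrative): a loop containing 3 strided loads gets
// UP.MaxCount = 1 << Log2_32(7 / 3) = 2, keeping the unrolled body at or
// below the MaxStridedLoads = 7 prefetcher budget.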
3702
3706 // Enable partial unrolling and runtime unrolling.
3707 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3708
3709 UP.UpperBound = true;
3710
3711 // An inner loop is more likely to be hot, and the runtime check can be
3712 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3713 // threshold to unroll more loops.
3714 if (L->getLoopDepth() > 1)
3715 UP.PartialThreshold *= 2;
3716
3717 // Disable partial & runtime unrolling on -Os.
3719
3720 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3723
3724 // Scan the loop: don't unroll loops with calls as this could prevent
3725 // inlining. Don't unroll vector loops either, as they don't benefit much from
3726 // unrolling.
3727 for (auto *BB : L->getBlocks()) {
3728 for (auto &I : *BB) {
3729 // Don't unroll vectorised loops.
3730 if (I.getType()->isVectorTy())
3731 return;
3732
3733 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3734 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3735 if (!isLoweredToCall(F))
3736 continue;
3737 }
3738 return;
3739 }
3740 }
3741 }
3742
3743 // Enable runtime unrolling for in-order models.
3744 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
3745 // by checking for that case, we can ensure that the default behaviour is
3746 // unchanged.
3748 !ST->getSchedModel().isOutOfOrder()) {
3749 UP.Runtime = true;
3750 UP.Partial = true;
3751 UP.UnrollRemainder = true;
3753
3754 UP.UnrollAndJam = true;
3756 }
3757}
3758
3762}
3763
3765 Type *ExpectedType) {
3766 switch (Inst->getIntrinsicID()) {
3767 default:
3768 return nullptr;
3769 case Intrinsic::aarch64_neon_st2:
3770 case Intrinsic::aarch64_neon_st3:
3771 case Intrinsic::aarch64_neon_st4: {
3772 // Create a struct type
3773 StructType *ST = dyn_cast<StructType>(ExpectedType);
3774 if (!ST)
3775 return nullptr;
3776 unsigned NumElts = Inst->arg_size() - 1;
3777 if (ST->getNumElements() != NumElts)
3778 return nullptr;
3779 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3780 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3781 return nullptr;
3782 }
3783 Value *Res = PoisonValue::get(ExpectedType);
3784 IRBuilder<> Builder(Inst);
3785 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3786 Value *L = Inst->getArgOperand(i);
3787 Res = Builder.CreateInsertValue(Res, L, i);
3788 }
3789 return Res;
3790 }
3791 case Intrinsic::aarch64_neon_ld2:
3792 case Intrinsic::aarch64_neon_ld3:
3793 case Intrinsic::aarch64_neon_ld4:
3794 if (Inst->getType() == ExpectedType)
3795 return Inst;
3796 return nullptr;
3797 }
3798}
3799
3801 MemIntrinsicInfo &Info) {
3802 switch (Inst->getIntrinsicID()) {
3803 default:
3804 break;
3805 case Intrinsic::aarch64_neon_ld2:
3806 case Intrinsic::aarch64_neon_ld3:
3807 case Intrinsic::aarch64_neon_ld4:
3808 Info.ReadMem = true;
3809 Info.WriteMem = false;
3810 Info.PtrVal = Inst->getArgOperand(0);
3811 break;
3812 case Intrinsic::aarch64_neon_st2:
3813 case Intrinsic::aarch64_neon_st3:
3814 case Intrinsic::aarch64_neon_st4:
3815 Info.ReadMem = false;
3816 Info.WriteMem = true;
3817 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3818 break;
3819 }
3820
3821 switch (Inst->getIntrinsicID()) {
3822 default:
3823 return false;
3824 case Intrinsic::aarch64_neon_ld2:
3825 case Intrinsic::aarch64_neon_st2:
3826 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3827 break;
3828 case Intrinsic::aarch64_neon_ld3:
3829 case Intrinsic::aarch64_neon_st3:
3830 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3831 break;
3832 case Intrinsic::aarch64_neon_ld4:
3833 case Intrinsic::aarch64_neon_st4:
3834 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3835 break;
3836 }
3837 return true;
3838}
3839
3840/// See if \p I should be considered for address type promotion. We check if
3841/// \p I is a sext with the right type that is used in memory accesses. If it
3842/// is used in a "complex" getelementptr, we allow it to be promoted without
3843/// finding other sext instructions that sign-extended the same initial value.
3844/// A getelementptr is considered "complex" if it has more than 2 operands.
3846 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3847 bool Considerable = false;
3848 AllowPromotionWithoutCommonHeader = false;
3849 if (!isa<SExtInst>(&I))
3850 return false;
3851 Type *ConsideredSExtType =
3852 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3853 if (I.getType() != ConsideredSExtType)
3854 return false;
3855 // See if the sext is the one with the right type and used in at least one
3856 // GetElementPtrInst.
3857 for (const User *U : I.users()) {
3858 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3859 Considerable = true;
3860 // A getelementptr is considered as "complex" if it has more than 2
3861 // operands. We will promote a SExt used in such complex GEP as we
3862 // expect some computation to be merged if they are done on 64 bits.
3863 if (GEPInst->getNumOperands() > 2) {
3864 AllowPromotionWithoutCommonHeader = true;
3865 break;
3866 }
3867 }
3868 }
3869 return Considerable;
3870}
3871
3873 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3874 if (!VF.isScalable())
3875 return true;
3876
3877 Type *Ty = RdxDesc.getRecurrenceType();
3879 return false;
3880
3881 switch (RdxDesc.getRecurrenceKind()) {
3882 case RecurKind::Add:
3883 case RecurKind::FAdd:
3884 case RecurKind::And:
3885 case RecurKind::Or:
3886 case RecurKind::Xor:
3887 case RecurKind::SMin:
3888 case RecurKind::SMax:
3889 case RecurKind::UMin:
3890 case RecurKind::UMax:
3891 case RecurKind::FMin:
3892 case RecurKind::FMax:
3893 case RecurKind::FMulAdd:
3894 case RecurKind::IAnyOf:
3895 case RecurKind::FAnyOf:
3896 return true;
3897 default:
3898 return false;
3899 }
3900}
3901
3904 FastMathFlags FMF,
3906 // The code-generator is currently not able to handle scalable vectors
3907 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3908 // it. This change will be removed when code-generation for these types is
3909 // sufficiently reliable.
3910 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3911 if (VTy->getElementCount() == ElementCount::getScalable(1))
3913
3914 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3915
3916 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3917 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3918
3919 InstructionCost LegalizationCost = 0;
3920 if (LT.first > 1) {
3921 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3922 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3923 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3924 }
3925
3926 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3927}
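// For example (illustrative): a min/max reduction over <8 x i32> legalizes
// to LT.first = 2 registers, so one vector min/max intrinsic is costed to
// combine the two halves (LT.first - 1 = 1) before the final horizontal
// reduction, which is modelled with a flat cost of 2.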
3928
3930 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3931 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3932 InstructionCost LegalizationCost = 0;
3933 if (LT.first > 1) {
3934 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3935 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3936 LegalizationCost *= LT.first - 1;
3937 }
3938
3939 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3940 assert(ISD && "Invalid opcode");
3941 // Add the final reduction cost for the legal horizontal reduction
3942 switch (ISD) {
3943 case ISD::ADD:
3944 case ISD::AND:
3945 case ISD::OR:
3946 case ISD::XOR:
3947 case ISD::FADD:
3948 return LegalizationCost + 2;
3949 default:
3951 }
3952}
3953
3956 std::optional<FastMathFlags> FMF,
3958 // The code-generator is currently not able to handle scalable vectors
3959 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3960 // it. This change will be removed when code-generation for these types is
3961 // sufficiently reliable.
3962 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
3963 if (VTy->getElementCount() == ElementCount::getScalable(1))
3965
3967 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3968 InstructionCost BaseCost =
3969 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3970 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3971 // end up vectorizing for more computationally intensive loops.
3972 return BaseCost + FixedVTy->getNumElements();
3973 }
3974
3975 if (Opcode != Instruction::FAdd)
3977
3978 auto *VTy = cast<ScalableVectorType>(ValTy);
3980 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3981 Cost *= getMaxNumElements(VTy->getElementCount());
3982 return Cost;
3983 }
3984
3985 if (isa<ScalableVectorType>(ValTy))
3986 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3987
3988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3989 MVT MTy = LT.second;
3990 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3991 assert(ISD && "Invalid opcode");
3992
3993 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3994 // instructions as twice a normal vector add, plus 1 for each legalization
3995 // step (LT.first). This is the only arithmetic vector reduction operation for
3996 // which we have an instruction.
3997 // OR, XOR and AND costs should match the codegen from:
3998 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3999 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
4000 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
4001 static const CostTblEntry CostTblNoPairwise[]{
4002 {ISD::ADD, MVT::v8i8, 2},
4003 {ISD::ADD, MVT::v16i8, 2},
4004 {ISD::ADD, MVT::v4i16, 2},
4005 {ISD::ADD, MVT::v8i16, 2},
4006 {ISD::ADD, MVT::v4i32, 2},
4007 {ISD::ADD, MVT::v2i64, 2},
4008 {ISD::OR, MVT::v8i8, 15},
4009 {ISD::OR, MVT::v16i8, 17},
4010 {ISD::OR, MVT::v4i16, 7},
4011 {ISD::OR, MVT::v8i16, 9},
4012 {ISD::OR, MVT::v2i32, 3},
4013 {ISD::OR, MVT::v4i32, 5},
4014 {ISD::OR, MVT::v2i64, 3},
4015 {ISD::XOR, MVT::v8i8, 15},
4016 {ISD::XOR, MVT::v16i8, 17},
4017 {ISD::XOR, MVT::v4i16, 7},
4018 {ISD::XOR, MVT::v8i16, 9},
4019 {ISD::XOR, MVT::v2i32, 3},
4020 {ISD::XOR, MVT::v4i32, 5},
4021 {ISD::XOR, MVT::v2i64, 3},
4022 {ISD::AND, MVT::v8i8, 15},
4023 {ISD::AND, MVT::v16i8, 17},
4024 {ISD::AND, MVT::v4i16, 7},
4025 {ISD::AND, MVT::v8i16, 9},
4026 {ISD::AND, MVT::v2i32, 3},
4027 {ISD::AND, MVT::v4i32, 5},
4028 {ISD::AND, MVT::v2i64, 3},
4029 };
4030 switch (ISD) {
4031 default:
4032 break;
4033 case ISD::ADD:
4034 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
4035 return (LT.first - 1) + Entry->Cost;
4036 break;
4037 case ISD::XOR:
4038 case ISD::AND:
4039 case ISD::OR:
4040 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
4041 if (!Entry)
4042 break;
4043 auto *ValVTy = cast<FixedVectorType>(ValTy);
4044 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
4045 isPowerOf2_32(ValVTy->getNumElements())) {
4046 InstructionCost ExtraCost = 0;
4047 if (LT.first != 1) {
4048 // Type needs to be split, so there is an extra cost of LT.first - 1
4049 // arithmetic ops.
4050 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
4051 MTy.getVectorNumElements());
4052 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
4053 ExtraCost *= LT.first - 1;
4054 }
4055 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
4056 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
4057 return Cost + ExtraCost;
4058 }
4059 break;
4060 }
4061 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
4062}
4063
4065 static const CostTblEntry ShuffleTbl[] = {
4066 { TTI::SK_Splice, MVT::nxv16i8, 1 },
4067 { TTI::SK_Splice, MVT::nxv8i16, 1 },
4068 { TTI::SK_Splice, MVT::nxv4i32, 1 },
4069 { TTI::SK_Splice, MVT::nxv2i64, 1 },
4070 { TTI::SK_Splice, MVT::nxv2f16, 1 },
4071 { TTI::SK_Splice, MVT::nxv4f16, 1 },
4072 { TTI::SK_Splice, MVT::nxv8f16, 1 },
4073 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
4074 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
4075 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
4076 { TTI::SK_Splice, MVT::nxv2f32, 1 },
4077 { TTI::SK_Splice, MVT::nxv4f32, 1 },
4078 { TTI::SK_Splice, MVT::nxv2f64, 1 },
4079 };
4080
4081 // The code-generator is currently not able to handle scalable vectors
4082 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4083 // it. This change will be removed when code-generation for these types is
4084 // sufficiently reliable.
4087
4088 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4089 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
4091 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
4092 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
4093 : LT.second;
4094 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
4095 InstructionCost LegalizationCost = 0;
4096 if (Index < 0) {
4097 LegalizationCost =
4098 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
4100 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
4102 }
4103
4104 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
4105 // The cost is computed on the promoted type.
4106 if (LT.second.getScalarType() == MVT::i1) {
4107 LegalizationCost +=
4108 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
4110 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
4112 }
4113 const auto *Entry =
4114 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
4115 assert(Entry && "Illegal Type for Splice");
4116 LegalizationCost += Entry->Cost;
4117 return LegalizationCost * LT.first;
4118}
4119
4123 ArrayRef<const Value *> Args, const Instruction *CxtI) {
4124 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
4125
4126 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4127 // into smaller vectors and sum the cost of each shuffle.
4128 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4129 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4130 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4131
4132 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4133 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4134 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4135 // cost than just the load.
4136 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4139 return std::max<InstructionCost>(1, LT.first / 4);
4140
4141 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4142 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4143 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4144 // cost than just the store.
4145 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4147 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4149 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4150 return LT.first;
4151
4152 unsigned TpNumElts = Mask.size();
4153 unsigned LTNumElts = LT.second.getVectorNumElements();
4154 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4155 VectorType *NTp =
4156 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4158 for (unsigned N = 0; N < NumVecs; N++) {
4159 SmallVector<int> NMask;
4160 // Split the existing mask into chunks of size LTNumElts. Track the source
4161 // sub-vectors to ensure the result has at most 2 inputs.
4162 unsigned Source1, Source2;
4163 unsigned NumSources = 0;
4164 for (unsigned E = 0; E < LTNumElts; E++) {
4165 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4167 if (MaskElt < 0) {
4169 continue;
4170 }
4171
4172 // Calculate which source from the input this comes from and whether it
4173 // is new to us.
4174 unsigned Source = MaskElt / LTNumElts;
4175 if (NumSources == 0) {
4176 Source1 = Source;
4177 NumSources = 1;
4178 } else if (NumSources == 1 && Source != Source1) {
4179 Source2 = Source;
4180 NumSources = 2;
4181 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4182 NumSources++;
4183 }
4184
4185 // Add to the new mask. For the NumSources>2 case these are not correct,
4186 // but are only used for the modular lane number.
4187 if (Source == Source1)
4188 NMask.push_back(MaskElt % LTNumElts);
4189 else if (Source == Source2)
4190 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4191 else
4192 NMask.push_back(MaskElt % LTNumElts);
4193 }
4194 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4195 // getShuffleCost. If not, cost it as the worst case: the number of
4196 // element moves into a new vector.
4197 if (NumSources <= 2)
4198 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4200 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4201 else
4202 Cost += LTNumElts;
4203 }
4204 return Cost;
4205 }
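// Sketch of the splitting above (illustrative): a 32-element shuffle mask on
// <32 x i8>, which legalizes to two v16i8 halves, is cut into two 16-element
// sub-masks; each sub-mask that reads from at most two source sub-vectors is
// re-costed as its own shuffle, while a sub-mask touching three or more
// sources is charged one move per element (LTNumElts).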
4206
4207 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4208 // Treat extractsubvector as single op permutation.
4209 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4210 if (IsExtractSubvector && LT.second.isFixedLengthVector())
4212
4213 // Check for broadcast loads, which are supported by the LD1R instruction.
4214 // In terms of code-size, the shuffle vector is free when a load + dup get
4215 // folded into a LD1R. That's what we check and return here. For performance
4216 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4217 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4218 // that we model the load + dup sequence slightly higher because LD1R is a
4219 // high latency instruction.
4220 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4221 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4222 if (IsLoad && LT.second.isVector() &&
4224 LT.second.getVectorElementCount()))
4225 return 0;
4226 }
4227
4228 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4229 // from the perfect shuffle tables.
4230 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4231 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4232 all_of(Mask, [](int E) { return E < 8; }))
4233 return getPerfectShuffleCost(Mask);
4234
4235 // Check for identity masks, which we can treat as free.
4236 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4237 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4238 all_of(enumerate(Mask), [](const auto &M) {
4239 return M.value() < 0 || M.value() == (int)M.index();
4240 }))
4241 return 0;
4242
4243 // Check for other shuffles that are not SK_ kinds but we have native
4244 // instructions for, for example ZIP and UZP.
4245 unsigned Unused;
4246 if (LT.second.isFixedLengthVector() &&
4247 LT.second.getVectorNumElements() == Mask.size() &&
4248 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4249 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4250 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4251 // Check for non-zero lane splats
4252 all_of(drop_begin(Mask),
4253 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4254 return 1;
4255
4256 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4257 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4258 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4259 static const CostTblEntry ShuffleTbl[] = {
4260 // Broadcast shuffle kinds can be performed with 'dup'.
4261 {TTI::SK_Broadcast, MVT::v8i8, 1},
4262 {TTI::SK_Broadcast, MVT::v16i8, 1},
4263 {TTI::SK_Broadcast, MVT::v4i16, 1},
4264 {TTI::SK_Broadcast, MVT::v8i16, 1},
4265 {TTI::SK_Broadcast, MVT::v2i32, 1},
4266 {TTI::SK_Broadcast, MVT::v4i32, 1},
4267 {TTI::SK_Broadcast, MVT::v2i64, 1},
4268 {TTI::SK_Broadcast, MVT::v4f16, 1},
4269 {TTI::SK_Broadcast, MVT::v8f16, 1},
4270 {TTI::SK_Broadcast, MVT::v2f32, 1},
4271 {TTI::SK_Broadcast, MVT::v4f32, 1},
4272 {TTI::SK_Broadcast, MVT::v2f64, 1},
4273 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4274 // 'zip1/zip2' instructions.
4275 {TTI::SK_Transpose, MVT::v8i8, 1},
4276 {TTI::SK_Transpose, MVT::v16i8, 1},
4277 {TTI::SK_Transpose, MVT::v4i16, 1},
4278 {TTI::SK_Transpose, MVT::v8i16, 1},
4279 {TTI::SK_Transpose, MVT::v2i32, 1},
4280 {TTI::SK_Transpose, MVT::v4i32, 1},
4281 {TTI::SK_Transpose, MVT::v2i64, 1},
4282 {TTI::SK_Transpose, MVT::v4f16, 1},
4283 {TTI::SK_Transpose, MVT::v8f16, 1},
4284 {TTI::SK_Transpose, MVT::v2f32, 1},
4285 {TTI::SK_Transpose, MVT::v4f32, 1},
4286 {TTI::SK_Transpose, MVT::v2f64, 1},
4287 // Select shuffle kinds.
4288 // TODO: handle vXi8/vXi16.
4289 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4290 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4291 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4292 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4293 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4294 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4295 // PermuteSingleSrc shuffle kinds.
4296 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4297 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4298 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4299 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4300 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4301 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4302 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4303 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4304 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4305 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4306 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4307 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4308 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4309 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4310 // Reverse can be lowered with `rev`.
4311 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4312 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4313 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4314 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4315 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4316 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4317 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4318 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4319 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4320 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4321 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4322 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4323 // Splice can all be lowered as `ext`.
4324 {TTI::SK_Splice, MVT::v2i32, 1},
4325 {TTI::SK_Splice, MVT::v4i32, 1},
4326 {TTI::SK_Splice, MVT::v2i64, 1},
4327 {TTI::SK_Splice, MVT::v2f32, 1},
4328 {TTI::SK_Splice, MVT::v4f32, 1},
4329 {TTI::SK_Splice, MVT::v2f64, 1},
4330 {TTI::SK_Splice, MVT::v8f16, 1},
4331 {TTI::SK_Splice, MVT::v8bf16, 1},
4332 {TTI::SK_Splice, MVT::v8i16, 1},
4333 {TTI::SK_Splice, MVT::v16i8, 1},
4334 {TTI::SK_Splice, MVT::v4bf16, 1},
4335 {TTI::SK_Splice, MVT::v4f16, 1},
4336 {TTI::SK_Splice, MVT::v4i16, 1},
4337 {TTI::SK_Splice, MVT::v8i8, 1},
4338 // Broadcast shuffle kinds for scalable vectors
4339 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4340 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4341 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4342 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4343 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4344 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4345 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4346 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4347 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4348 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4349 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4350 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4351 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4352 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4353 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4354 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4355 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4356 // Handle the cases for vector.reverse with scalable vectors
4357 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4358 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4359 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4360 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4361 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4362 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4363 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4364 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4365 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4366 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4367 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4368 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4369 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4370 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4371 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4372 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4373 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4374 };
4375 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4376 return LT.first * Entry->Cost;
4377 }
4378
4379 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4380 return getSpliceCost(Tp, Index);
4381
4382 // Inserting a subvector can often be done with either a D, S or H register
4383 // move, so long as the inserted vector is "aligned".
4384 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4385 LT.second.getSizeInBits() <= 128 && SubTp) {
4386 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4387 if (SubLT.second.isVector()) {
4388 int NumElts = LT.second.getVectorNumElements();
4389 int NumSubElts = SubLT.second.getVectorNumElements();
4390 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4391 return SubLT.first;
4392 }
4393 }
4394
4395 // Restore optimal kind.
4396 if (IsExtractSubvector)
4398 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4399 CxtI);
4400}
4401
4404 const auto &Strides = DenseMap<Value *, const SCEV *>();
4405 for (BasicBlock *BB : TheLoop->blocks()) {
4406 // Scan the instructions in the block and look for addresses that are
4407 // consecutive and decreasing.
4408 for (Instruction &I : *BB) {
4409 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4411 Type *AccessTy = getLoadStoreType(&I);
4412 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4413 /*ShouldCheckWrap=*/false)
4414 .value_or(0) < 0)
4415 return true;
4416 }
4417 }
4418 }
4419 return false;
4420}
4421
4423 if (!ST->hasSVE())
4424 return false;
4425
4426 // We don't currently support vectorisation with interleaving for SVE - with
4427 // such loops we're better off not using tail-folding. This gives us a chance
4428 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4429 if (TFI->IAI->hasGroups())
4430 return false;
4431
4433 if (TFI->LVL->getReductionVars().size())
4434 Required |= TailFoldingOpts::Reductions;
4435 if (TFI->LVL->getFixedOrderRecurrences().size())
4436 Required |= TailFoldingOpts::Recurrences;
4437
4438 // We call this to discover whether any load/store pointers in the loop have
4439 // negative strides. This will require extra work to reverse the loop
4440 // predicate, which may be expensive.
4443 Required |= TailFoldingOpts::Reverse;
4444 if (Required == TailFoldingOpts::Disabled)
4445 Required |= TailFoldingOpts::Simple;
4446
4448 Required))
4449 return false;
4450
4451 // Don't tail-fold for tight loops where we would be better off interleaving
4452 // with an unpredicated loop.
4453 unsigned NumInsns = 0;
4454 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4455 NumInsns += BB->sizeWithoutDebug();
4456 }
4457
4458 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4459 return NumInsns >= SVETailFoldInsnThreshold;
4460}
4461
4464 StackOffset BaseOffset, bool HasBaseReg,
4465 int64_t Scale, unsigned AddrSpace) const {
4466 // Scaling factors are not free at all.
4467 // Operands | Rt Latency
4468 // -------------------------------------------
4469 // Rt, [Xn, Xm] | 4
4470 // -------------------------------------------
4471 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4472 // Rt, [Xn, Wm, <extend> #imm] |
4474 AM.BaseGV = BaseGV;
4475 AM.BaseOffs = BaseOffset.getFixed();
4476 AM.HasBaseReg = HasBaseReg;
4477 AM.Scale = Scale;
4478 AM.ScalableOffset = BaseOffset.getScalable();
4479 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4480 // Scale represents reg2 * scale, thus account for 1 if
4481 // it is not equal to 0 or 1.
4482 return AM.Scale != 0 && AM.Scale != 1;
4483 return -1;
4484}
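// For example (illustrative): "ldr x0, [x1, x2, lsl #3]" uses a scaled
// register offset, so AM.Scale == 8 and a cost of 1 is returned, whereas
// "ldr x0, [x1, x2]" has AM.Scale == 1 and is treated as free.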
4485
4487 // For the binary operators (e.g. or) we need to be more careful than with
4488 // selects; here we only transform them if they are already at a natural
4489 // break point in the code, i.e. the end of a block with an unconditional
4490 // terminator.
4491 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4492 isa<BranchInst>(I->getNextNode()) &&
4493 cast<BranchInst>(I->getNextNode())->isUnconditional())
4494 return true;
4496}
4497
4499 const TargetTransformInfo::LSRCost &C2) {
4500 // AArch64 specific here is adding the number of instructions to the
4501 // comparison (though not as the first consideration, as some targets do)
4502 // along with changing the priority of the base additions.
4503 // TODO: Maybe a more nuanced tradeoff between instruction count
4504 // and number of registers? To be investigated at a later date.
4505 if (EnableLSRCostOpt)
4506 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4507 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4508 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4509 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4510
4511 return BaseT::isLSRCostLess(C1, C2);
4512}
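Because std::tie compares its fields lexicographically, the ordering above means a candidate that uses fewer registers always wins, and the instruction count (and the later fields) only break ties. A minimal, self-contained sketch of that behaviour, not part of this file:

// Illustrative sketch only: the first field dominates and later fields break
// ties, mirroring the (NumRegs, Insns, ...) ordering used by isLSRCostLess.
#include <tuple>
static_assert(std::make_tuple(2, 9) < std::make_tuple(3, 1),
              "fewer registers is preferred even at a higher instruction count");
static_assert(std::make_tuple(2, 5) < std::make_tuple(2, 6),
              "with equal register counts, fewer instructions wins");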