AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
45static cl::opt<unsigned>
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
47 cl::Hidden);
48
49static cl::opt<unsigned> CallPenaltyChangeSM(
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
51 cl::desc(
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
53
54static cl::opt<unsigned> InlineCallPenaltyChangeSM(
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
57
58static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
59 cl::init(true), cl::Hidden);
60
61static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
62 cl::init(true), cl::Hidden);
63
64namespace {
65class TailFoldingOption {
66 // These bitfields will only ever be set to something non-zero in operator=,
67 // when setting the -sve-tail-folding option. This option should always be of
68 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
69 // InitialBits is one of (disabled|all|simple). EnableBits represents
70 // additional flags we're enabling, and DisableBits for those flags we're
71 // disabling. The default flag is tracked in the variable NeedsDefault, since
72 // at the time of setting the option we may not know what the default value
73 // for the CPU is.
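 // Illustrative example (not from the original source): setting the option to
 // "all+noreverse" results in InitialBits=All, DisableBits=Reverse and
 // NeedsDefault=false, so getBits() below returns All & ~Reverse regardless
 // of the CPU's default tail-folding options.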
74 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
75 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
76 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
77
78 // This value needs to be initialised to true in case the user does not
79 // explicitly set the -sve-tail-folding option.
80 bool NeedsDefault = true;
81
82 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
83
84 void setNeedsDefault(bool V) { NeedsDefault = V; }
85
86 void setEnableBit(TailFoldingOpts Bit) {
87 EnableBits |= Bit;
88 DisableBits &= ~Bit;
89 }
90
91 void setDisableBit(TailFoldingOpts Bit) {
92 EnableBits &= ~Bit;
93 DisableBits |= Bit;
94 }
95
96 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
97 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
98
99 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
100 "Initial bits should only include one of "
101 "(disabled|all|simple|default)");
102 Bits = NeedsDefault ? DefaultBits : InitialBits;
103 Bits |= EnableBits;
104 Bits &= ~DisableBits;
105
106 return Bits;
107 }
108
109 void reportError(std::string Opt) {
110 errs() << "invalid argument '" << Opt
111 << "' to -sve-tail-folding=; the option should be of the form\n"
112 " (disabled|all|default|simple)[+(reductions|recurrences"
113 "|reverse|noreductions|norecurrences|noreverse)]\n";
114 report_fatal_error("Unrecognised tail-folding option");
115 }
116
117public:
118
119 void operator=(const std::string &Val) {
120 // If the user explicitly sets -sve-tail-folding= then treat as an error.
121 if (Val.empty()) {
122 reportError("");
123 return;
124 }
125
126 // Since the user is explicitly setting the option we don't automatically
127 // need the default unless they require it.
128 setNeedsDefault(false);
129
130 SmallVector<StringRef, 4> TailFoldTypes;
131 StringRef(Val).split(TailFoldTypes, '+', -1, false);
132
133 unsigned StartIdx = 1;
134 if (TailFoldTypes[0] == "disabled")
135 setInitialBits(TailFoldingOpts::Disabled);
136 else if (TailFoldTypes[0] == "all")
137 setInitialBits(TailFoldingOpts::All);
138 else if (TailFoldTypes[0] == "default")
139 setNeedsDefault(true);
140 else if (TailFoldTypes[0] == "simple")
141 setInitialBits(TailFoldingOpts::Simple);
142 else {
143 StartIdx = 0;
144 setInitialBits(TailFoldingOpts::Disabled);
145 }
146
147 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
148 if (TailFoldTypes[I] == "reductions")
149 setEnableBit(TailFoldingOpts::Reductions);
150 else if (TailFoldTypes[I] == "recurrences")
151 setEnableBit(TailFoldingOpts::Recurrences);
152 else if (TailFoldTypes[I] == "reverse")
153 setEnableBit(TailFoldingOpts::Reverse);
154 else if (TailFoldTypes[I] == "noreductions")
155 setDisableBit(TailFoldingOpts::Reductions);
156 else if (TailFoldTypes[I] == "norecurrences")
157 setDisableBit(TailFoldingOpts::Recurrences);
158 else if (TailFoldTypes[I] == "noreverse")
159 setDisableBit(TailFoldingOpts::Reverse);
160 else
161 reportError(Val);
162 }
163 }
164
165 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
166 return (getBits(DefaultBits) & Required) == Required;
167 }
168};
169} // namespace
170
171TailFoldingOption TailFoldingOptionLoc;
172
173cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
174 "sve-tail-folding",
175 cl::desc(
176 "Control the use of vectorisation using tail-folding for SVE where the"
177 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
178 "\ndisabled (Initial) No loop types will vectorize using "
179 "tail-folding"
180 "\ndefault (Initial) Uses the default tail-folding settings for "
181 "the target CPU"
182 "\nall (Initial) All legal loop types will vectorize using "
183 "tail-folding"
184 "\nsimple (Initial) Use tail-folding for simple loops (not "
185 "reductions or recurrences)"
186 "\nreductions Use tail-folding for loops containing reductions"
187 "\nnoreductions Inverse of above"
188 "\nrecurrences Use tail-folding for loops containing fixed order "
189 "recurrences"
190 "\nnorecurrences Inverse of above"
191 "\nreverse Use tail-folding for loops requiring reversed "
192 "predicates"
193 "\nnoreverse Inverse of above"),
194 cl::location(TailFoldingOptionLoc));
195
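// Example usage (illustrative, not from the original source):
//   -mllvm -sve-tail-folding=default+noreverse
// keeps the target CPU's default tail-folding behaviour but additionally
// disables tail-folding for loops that require reversed predicates.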
196// Experimental option that will only be fully functional when the
197// code-generator is changed to use SVE instead of NEON for all fixed-width
198// operations.
199static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
200 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
201
202// Experimental option that will only be fully functional when the cost-model
203// and code-generator have been changed to avoid using scalable vector
204// instructions that are not legal in streaming SVE mode.
205static cl::opt<bool> EnableScalableAutovecInStreamingMode(
206 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
207
208static bool isSMEABIRoutineCall(const CallInst &CI) {
209 const auto *F = CI.getCalledFunction();
210 return F && StringSwitch<bool>(F->getName())
211 .Case("__arm_sme_state", true)
212 .Case("__arm_tpidr2_save", true)
213 .Case("__arm_tpidr2_restore", true)
214 .Case("__arm_za_disable", true)
215 .Default(false);
216}
217
218/// Returns true if the function has explicit operations that can only be
219/// lowered using incompatible instructions for the selected mode. This also
220/// returns true if the function F may use or modify ZA state.
221static bool hasPossibleIncompatibleOps(const Function *F) {
222 for (const BasicBlock &BB : *F) {
223 for (const Instruction &I : BB) {
224 // Be conservative for now and assume that any call to inline asm or to
225 // intrinsics could result in non-streaming ops (e.g. calls to
226 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
227 // all native LLVM instructions can be lowered to compatible instructions.
228 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
229 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
230 isSMEABIRoutineCall(cast<CallInst>(I))))
231 return true;
232 }
233 }
234 return false;
235}
236
237bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
238 const Function *Callee) const {
239 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
240
241 // When inlining, we should consider the body of the function, not the
242 // interface.
243 if (CalleeAttrs.hasStreamingBody()) {
244 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
245 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
246 }
247
248 if (CalleeAttrs.isNewZA())
249 return false;
250
251 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
252 CallerAttrs.requiresSMChange(CalleeAttrs)) {
253 if (hasPossibleIncompatibleOps(Callee))
254 return false;
255 }
256
257 const TargetMachine &TM = getTLI()->getTargetMachine();
258
259 const FeatureBitset &CallerBits =
260 TM.getSubtargetImpl(*Caller)->getFeatureBits();
261 const FeatureBitset &CalleeBits =
262 TM.getSubtargetImpl(*Callee)->getFeatureBits();
263
264 // Inline a callee if its target-features are a subset of the callers
265 // target-features.
266 return (CallerBits & CalleeBits) == CalleeBits;
267}
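// Illustrative example (assumed feature names, not from the original source):
// a caller built with "+sve,+sve2" may inline a callee built with "+sve"
// because the callee's feature bits are a subset of the caller's, whereas a
// callee requiring a feature the caller lacks is rejected.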
268
269bool AArch64TTIImpl::areTypesABICompatible(
270 const Function *Caller, const Function *Callee,
271 const ArrayRef<Type *> &Types) const {
272 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
273 return false;
274
275 // We need to ensure that argument promotion does not attempt to promote
276 // pointers to fixed-length vector types larger than 128 bits like
277 // <8 x float> (and pointers to aggregate types which have such fixed-length
278 // vector type members) into the values of the pointees. Such vector types
279 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
280 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
281 // types can be safely treated as 128-bit NEON types and they cannot be
282 // distinguished in IR.
283 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
284 auto FVTy = dyn_cast<FixedVectorType>(Ty);
285 return FVTy &&
286 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
287 }))
288 return false;
289
290 return true;
291}
292
293unsigned
294AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
295 unsigned DefaultCallPenalty) const {
296 // This function calculates a penalty for executing Call in F.
297 //
298 // There are two ways this function can be called:
299 // (1) F:
300 // call from F -> G (the call here is Call)
301 //
302 // For (1), Call.getCaller() == F, so it will always return a high cost if
303 // a streaming-mode change is required (thus promoting the need to inline the
304 // function)
305 //
306 // (2) F:
307 // call from F -> G (the call here is not Call)
308 // G:
309 // call from G -> H (the call here is Call)
310 //
311 // For (2), if after inlining the body of G into F the call to H requires a
312 // streaming-mode change, and the call to G from F would also require a
313 // streaming-mode change, then there is benefit to do the streaming-mode
314 // change only once and avoid inlining of G into F.
315 SMEAttrs FAttrs(*F);
316 SMEAttrs CalleeAttrs(Call);
317 if (FAttrs.requiresSMChange(CalleeAttrs)) {
318 if (F == Call.getCaller()) // (1)
319 return CallPenaltyChangeSM * DefaultCallPenalty;
320 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
321 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
322 }
323
324 return DefaultCallPenalty;
325}
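// Worked example (illustrative): with the default CallPenaltyChangeSM=5 and
// InlineCallPenaltyChangeSM=10, case (1) returns 5 * DefaultCallPenalty and
// case (2) returns 10 * DefaultCallPenalty; otherwise the penalty is simply
// DefaultCallPenalty.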
326
327bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
328 TargetTransformInfo::RegisterKind K) const {
329 assert(K != TargetTransformInfo::RGK_Scalar);
330 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
331 ST->isNeonAvailable());
332}
333
334/// Calculate the cost of materializing a 64-bit value. This helper
335/// method might only calculate a fraction of a larger immediate. Therefore it
336/// is valid to return a cost of ZERO.
337InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
338 // Check if the immediate can be encoded within an instruction.
339 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
340 return 0;
341
342 if (Val < 0)
343 Val = ~Val;
344
345 // Calculate how many moves we will need to materialize this constant.
346 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
347 AArch64_IMM::expandMOVImm(Val, 64, Insn);
348 return Insn.size();
349}
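// Illustrative example (not from the original source): a logical immediate
// such as 0x00FF00FF00FF00FF is directly encodable and costs 0, whereas a
// constant with four distinct non-zero 16-bit chunks typically expands to a
// MOVZ plus three MOVKs, giving a cost of 4.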
350
351/// Calculate the cost of materializing the given constant.
352InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
353 TTI::TargetCostKind CostKind) {
354 assert(Ty->isIntegerTy());
355
356 unsigned BitSize = Ty->getPrimitiveSizeInBits();
357 if (BitSize == 0)
358 return ~0U;
359
360 // Sign-extend all constants to a multiple of 64-bit.
361 APInt ImmVal = Imm;
362 if (BitSize & 0x3f)
363 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
364
365 // Split the constant into 64-bit chunks and calculate the cost for each
366 // chunk.
367 InstructionCost Cost = 0;
368 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
369 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
370 int64_t Val = Tmp.getSExtValue();
371 Cost += getIntImmCost(Val);
372 }
373 // We need at least one instruction to materialize the constant.
374 return std::max<InstructionCost>(1, Cost);
375}
376
377InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
378 const APInt &Imm, Type *Ty,
379 TTI::TargetCostKind CostKind,
380 Instruction *Inst) {
381 assert(Ty->isIntegerTy());
382
383 unsigned BitSize = Ty->getPrimitiveSizeInBits();
384 // There is no cost model for constants with a bit size of 0. Return TCC_Free
385 // here, so that constant hoisting will ignore this constant.
386 if (BitSize == 0)
387 return TTI::TCC_Free;
388
389 unsigned ImmIdx = ~0U;
390 switch (Opcode) {
391 default:
392 return TTI::TCC_Free;
393 case Instruction::GetElementPtr:
394 // Always hoist the base address of a GetElementPtr.
395 if (Idx == 0)
396 return 2 * TTI::TCC_Basic;
397 return TTI::TCC_Free;
398 case Instruction::Store:
399 ImmIdx = 0;
400 break;
401 case Instruction::Add:
402 case Instruction::Sub:
403 case Instruction::Mul:
404 case Instruction::UDiv:
405 case Instruction::SDiv:
406 case Instruction::URem:
407 case Instruction::SRem:
408 case Instruction::And:
409 case Instruction::Or:
410 case Instruction::Xor:
411 case Instruction::ICmp:
412 ImmIdx = 1;
413 break;
414 // Always return TCC_Free for the shift value of a shift instruction.
415 case Instruction::Shl:
416 case Instruction::LShr:
417 case Instruction::AShr:
418 if (Idx == 1)
419 return TTI::TCC_Free;
420 break;
421 case Instruction::Trunc:
422 case Instruction::ZExt:
423 case Instruction::SExt:
424 case Instruction::IntToPtr:
425 case Instruction::PtrToInt:
426 case Instruction::BitCast:
427 case Instruction::PHI:
428 case Instruction::Call:
429 case Instruction::Select:
430 case Instruction::Ret:
431 case Instruction::Load:
432 break;
433 }
434
435 if (Idx == ImmIdx) {
436 int NumConstants = (BitSize + 63) / 64;
437 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
438 return (Cost <= NumConstants * TTI::TCC_Basic)
439 ? static_cast<int>(TTI::TCC_Free)
440 : Cost;
441 }
442 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
443}
444
445InstructionCost
446AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
447 const APInt &Imm, Type *Ty,
448 TTI::TargetCostKind CostKind) {
449 assert(Ty->isIntegerTy());
450
451 unsigned BitSize = Ty->getPrimitiveSizeInBits();
452 // There is no cost model for constants with a bit size of 0. Return TCC_Free
453 // here, so that constant hoisting will ignore this constant.
454 if (BitSize == 0)
455 return TTI::TCC_Free;
456
457 // Most (all?) AArch64 intrinsics do not support folding immediates into the
458 // selected instruction, so we compute the materialization cost for the
459 // immediate directly.
460 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
461 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
462
463 switch (IID) {
464 default:
465 return TTI::TCC_Free;
466 case Intrinsic::sadd_with_overflow:
467 case Intrinsic::uadd_with_overflow:
468 case Intrinsic::ssub_with_overflow:
469 case Intrinsic::usub_with_overflow:
470 case Intrinsic::smul_with_overflow:
471 case Intrinsic::umul_with_overflow:
472 if (Idx == 1) {
473 int NumConstants = (BitSize + 63) / 64;
474 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
475 return (Cost <= NumConstants * TTI::TCC_Basic)
476 ? static_cast<int>(TTI::TCC_Free)
477 : Cost;
478 }
479 break;
480 case Intrinsic::experimental_stackmap:
481 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
482 return TTI::TCC_Free;
483 break;
484 case Intrinsic::experimental_patchpoint_void:
485 case Intrinsic::experimental_patchpoint:
486 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
487 return TTI::TCC_Free;
488 break;
489 case Intrinsic::experimental_gc_statepoint:
490 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
491 return TTI::TCC_Free;
492 break;
493 }
494 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
495}
496
497TargetTransformInfo::PopcntSupportKind
498AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
499 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
500 if (TyWidth == 32 || TyWidth == 64)
501 return TTI::PSK_FastHardware;
502 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
503 return TTI::PSK_Software;
504}
505
506static bool isUnpackedVectorVT(EVT VecVT) {
507 return VecVT.isScalableVector() &&
508 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
509}
510
511InstructionCost
512AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
513 TTI::TargetCostKind CostKind) {
514 auto *RetTy = ICA.getReturnType();
515 switch (ICA.getID()) {
516 case Intrinsic::umin:
517 case Intrinsic::umax:
518 case Intrinsic::smin:
519 case Intrinsic::smax: {
520 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
521 MVT::v8i16, MVT::v2i32, MVT::v4i32,
522 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
523 MVT::nxv2i64};
524 auto LT = getTypeLegalizationCost(RetTy);
525 // v2i64 types get converted to cmp+bif hence the cost of 2
526 if (LT.second == MVT::v2i64)
527 return LT.first * 2;
528 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
529 return LT.first;
530 break;
531 }
532 case Intrinsic::sadd_sat:
533 case Intrinsic::ssub_sat:
534 case Intrinsic::uadd_sat:
535 case Intrinsic::usub_sat: {
536 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
537 MVT::v8i16, MVT::v2i32, MVT::v4i32,
538 MVT::v2i64};
539 auto LT = getTypeLegalizationCost(RetTy);
540 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
541 // need to extend the type, as it uses shr(qadd(shl, shl)).
542 unsigned Instrs =
543 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
544 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
545 return LT.first * Instrs;
546 break;
547 }
548 case Intrinsic::abs: {
549 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
550 MVT::v8i16, MVT::v2i32, MVT::v4i32,
551 MVT::v2i64};
552 auto LT = getTypeLegalizationCost(RetTy);
553 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
554 return LT.first;
555 break;
556 }
557 case Intrinsic::bswap: {
558 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
559 MVT::v4i32, MVT::v2i64};
560 auto LT = getTypeLegalizationCost(RetTy);
561 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
562 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
563 return LT.first;
564 break;
565 }
566 case Intrinsic::experimental_stepvector: {
567 InstructionCost Cost = 1; // Cost of the `index' instruction
568 auto LT = getTypeLegalizationCost(RetTy);
569 // Legalisation of illegal vectors involves an `index' instruction plus
570 // (LT.first - 1) vector adds.
571 if (LT.first > 1) {
572 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
573 InstructionCost AddCost =
574 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
575 Cost += AddCost * (LT.first - 1);
576 }
577 return Cost;
578 }
579 case Intrinsic::vector_extract:
580 case Intrinsic::vector_insert: {
581 // If both the vector and subvector types are legal types and the index
582 // is 0, then this should be a no-op or simple operation; return a
583 // relatively low cost.
584
585 // If arguments aren't actually supplied, then we cannot determine the
586 // value of the index. We also want to skip predicate types.
587 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
588 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
589 break;
590
591 LLVMContext &C = RetTy->getContext();
592 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
593 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
594 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
595 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
596 // Skip this if either the vector or subvector types are unpacked
597 // SVE types; they may get lowered to stack stores and loads.
598 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
599 break;
600
601 TargetLoweringBase::LegalizeKind SubVecLK =
602 getTLI()->getTypeConversion(C, SubVecVT);
603 TargetLoweringBase::LegalizeKind VecLK =
604 getTLI()->getTypeConversion(C, VecVT);
605 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
606 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
607 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
608 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
609 return TTI::TCC_Free;
610 break;
611 }
612 case Intrinsic::bitreverse: {
613 static const CostTblEntry BitreverseTbl[] = {
614 {Intrinsic::bitreverse, MVT::i32, 1},
615 {Intrinsic::bitreverse, MVT::i64, 1},
616 {Intrinsic::bitreverse, MVT::v8i8, 1},
617 {Intrinsic::bitreverse, MVT::v16i8, 1},
618 {Intrinsic::bitreverse, MVT::v4i16, 2},
619 {Intrinsic::bitreverse, MVT::v8i16, 2},
620 {Intrinsic::bitreverse, MVT::v2i32, 2},
621 {Intrinsic::bitreverse, MVT::v4i32, 2},
622 {Intrinsic::bitreverse, MVT::v1i64, 2},
623 {Intrinsic::bitreverse, MVT::v2i64, 2},
624 };
625 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
626 const auto *Entry =
627 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
628 if (Entry) {
629 // The cost model uses the legal type (i32) that i8 and i16 are promoted
630 // to, plus 1, so that we match the actual lowering cost.
631 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
632 TLI->getValueType(DL, RetTy, true) == MVT::i16)
633 return LegalisationCost.first * Entry->Cost + 1;
634
635 return LegalisationCost.first * Entry->Cost;
636 }
637 break;
638 }
639 case Intrinsic::ctpop: {
640 if (!ST->hasNEON()) {
641 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
642 return getTypeLegalizationCost(RetTy).first * 12;
643 }
644 static const CostTblEntry CtpopCostTbl[] = {
645 {ISD::CTPOP, MVT::v2i64, 4},
646 {ISD::CTPOP, MVT::v4i32, 3},
647 {ISD::CTPOP, MVT::v8i16, 2},
648 {ISD::CTPOP, MVT::v16i8, 1},
649 {ISD::CTPOP, MVT::i64, 4},
650 {ISD::CTPOP, MVT::v2i32, 3},
651 {ISD::CTPOP, MVT::v4i16, 2},
652 {ISD::CTPOP, MVT::v8i8, 1},
653 {ISD::CTPOP, MVT::i32, 5},
654 };
655 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
656 MVT MTy = LT.second;
657 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
658 // Extra cost of +1 when illegal vector types are legalized by promoting
659 // the integer type.
660 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
661 RetTy->getScalarSizeInBits()
662 ? 1
663 : 0;
664 return LT.first * Entry->Cost + ExtraCost;
665 }
666 break;
667 }
668 case Intrinsic::sadd_with_overflow:
669 case Intrinsic::uadd_with_overflow:
670 case Intrinsic::ssub_with_overflow:
671 case Intrinsic::usub_with_overflow:
672 case Intrinsic::smul_with_overflow:
673 case Intrinsic::umul_with_overflow: {
674 static const CostTblEntry WithOverflowCostTbl[] = {
675 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
676 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
677 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
678 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
679 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
680 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
681 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
682 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
683 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
684 {Intrinsic::usub_with_overflow, MVT::i8, 3},
685 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
686 {Intrinsic::usub_with_overflow, MVT::i16, 3},
687 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
688 {Intrinsic::usub_with_overflow, MVT::i32, 1},
689 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
690 {Intrinsic::usub_with_overflow, MVT::i64, 1},
691 {Intrinsic::smul_with_overflow, MVT::i8, 5},
692 {Intrinsic::umul_with_overflow, MVT::i8, 4},
693 {Intrinsic::smul_with_overflow, MVT::i16, 5},
694 {Intrinsic::umul_with_overflow, MVT::i16, 4},
695 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
696 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
697 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
698 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
699 };
700 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
701 if (MTy.isSimple())
702 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
703 MTy.getSimpleVT()))
704 return Entry->Cost;
705 break;
706 }
707 case Intrinsic::fptosi_sat:
708 case Intrinsic::fptoui_sat: {
709 if (ICA.getArgTypes().empty())
710 break;
711 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
712 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
713 EVT MTy = TLI->getValueType(DL, RetTy);
714 // Check for the legal types, which are where the size of the input and the
715 // output are the same, or we are using cvt f64->i32 or f32->i64.
716 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
717 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
718 LT.second == MVT::v2f64) &&
719 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
720 (LT.second == MVT::f64 && MTy == MVT::i32) ||
721 (LT.second == MVT::f32 && MTy == MVT::i64)))
722 return LT.first;
723 // Similarly for fp16 sizes
724 if (ST->hasFullFP16() &&
725 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
726 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
727 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
728 return LT.first;
729
730 // Otherwise we use a legal convert followed by a min+max
731 if ((LT.second.getScalarType() == MVT::f32 ||
732 LT.second.getScalarType() == MVT::f64 ||
733 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
734 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
735 Type *LegalTy =
736 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
737 if (LT.second.isVector())
738 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
739 InstructionCost Cost = 1;
740 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
741 LegalTy, {LegalTy, LegalTy});
742 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
743 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
744 LegalTy, {LegalTy, LegalTy});
745 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
746 return LT.first * Cost;
747 }
748 break;
749 }
750 case Intrinsic::fshl:
751 case Intrinsic::fshr: {
752 if (ICA.getArgs().empty())
753 break;
754
755 // TODO: Add handling for fshl where third argument is not a constant.
756 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
757 if (!OpInfoZ.isConstant())
758 break;
759
760 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
761 if (OpInfoZ.isUniform()) {
762 // FIXME: The costs could be lower if the codegen is better.
763 static const CostTblEntry FshlTbl[] = {
764 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
765 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
766 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
767 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
768 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
769 // to avoid having to duplicate the costs.
770 const auto *Entry =
771 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
772 if (Entry)
773 return LegalisationCost.first * Entry->Cost;
774 }
775
776 auto TyL = getTypeLegalizationCost(RetTy);
777 if (!RetTy->isIntegerTy())
778 break;
779
780 // Estimate cost manually, as types like i8 and i16 will get promoted to
781 // i32 and CostTableLookup will ignore the extra conversion cost.
782 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
783 RetTy->getScalarSizeInBits() < 64) ||
784 (RetTy->getScalarSizeInBits() % 64 != 0);
785 unsigned ExtraCost = HigherCost ? 1 : 0;
786 if (RetTy->getScalarSizeInBits() == 32 ||
787 RetTy->getScalarSizeInBits() == 64)
788 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
789 // extr instruction.
790 else if (HigherCost)
791 ExtraCost = 1;
792 else
793 break;
794 return TyL.first + ExtraCost;
795 }
796 case Intrinsic::get_active_lane_mask: {
797 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
798 if (RetTy) {
799 EVT RetVT = getTLI()->getValueType(DL, RetTy);
800 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
801 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
802 !getTLI()->isTypeLegal(RetVT)) {
803 // We don't have enough context at this point to determine if the mask
804 // is going to be kept live after the block, which will force the vXi1
805 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
806 // For now, we just assume the vectorizer created this intrinsic and
807 // the result will be the input for a PHI. In this case the cost will
808 // be extremely high for fixed-width vectors.
809 // NOTE: getScalarizationOverhead returns a cost that's far too
810 // pessimistic for the actual generated codegen. In reality there are
811 // two instructions generated per lane.
812 return RetTy->getNumElements() * 2;
813 }
814 }
815 break;
816 }
817 default:
818 break;
819 }
820 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
821}
822
823/// The function will remove redundant reinterprets casting in the presence
824/// of the control flow
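///
/// Illustrative IR (not from the original source):
///   %phi = phi <vscale x 16 x i1> [ %to_svbool.a, %bb1 ], [ %to_svbool.b, %bb2 ]
///   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(%phi)
/// becomes a phi over the original <vscale x 4 x i1> operands, allowing the
/// convert.to/from.svbool pair to be removed.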
825static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
826 IntrinsicInst &II) {
827 SmallVector<Instruction *, 32> Worklist;
828 auto RequiredType = II.getType();
829
830 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
831 assert(PN && "Expected Phi Node!");
832
833 // Don't create a new Phi unless we can remove the old one.
834 if (!PN->hasOneUse())
835 return std::nullopt;
836
837 for (Value *IncValPhi : PN->incoming_values()) {
838 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
839 if (!Reinterpret ||
840 Reinterpret->getIntrinsicID() !=
841 Intrinsic::aarch64_sve_convert_to_svbool ||
842 RequiredType != Reinterpret->getArgOperand(0)->getType())
843 return std::nullopt;
844 }
845
846 // Create the new Phi
847 IC.Builder.SetInsertPoint(PN);
848 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
849 Worklist.push_back(PN);
850
851 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
852 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
853 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
854 Worklist.push_back(Reinterpret);
855 }
856
857 // Cleanup Phi Node and reinterprets
858 return IC.replaceInstUsesWith(II, NPN);
859}
860
861// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
862// => (binop (pred) (from_svbool _) (from_svbool _))
863//
864// The above transformation eliminates a `to_svbool` in the predicate
865// operand of bitwise operation `binop` by narrowing the vector width of
866// the operation. For example, it would convert a `<vscale x 16 x i1>
867// and` into a `<vscale x 4 x i1> and`. This is profitable because
868// to_svbool must zero the new lanes during widening, whereas
869// from_svbool is free.
870static std::optional<Instruction *>
871tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
872 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
873 if (!BinOp)
874 return std::nullopt;
875
876 auto IntrinsicID = BinOp->getIntrinsicID();
877 switch (IntrinsicID) {
878 case Intrinsic::aarch64_sve_and_z:
879 case Intrinsic::aarch64_sve_bic_z:
880 case Intrinsic::aarch64_sve_eor_z:
881 case Intrinsic::aarch64_sve_nand_z:
882 case Intrinsic::aarch64_sve_nor_z:
883 case Intrinsic::aarch64_sve_orn_z:
884 case Intrinsic::aarch64_sve_orr_z:
885 break;
886 default:
887 return std::nullopt;
888 }
889
890 auto BinOpPred = BinOp->getOperand(0);
891 auto BinOpOp1 = BinOp->getOperand(1);
892 auto BinOpOp2 = BinOp->getOperand(2);
893
894 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
895 if (!PredIntr ||
896 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
897 return std::nullopt;
898
899 auto PredOp = PredIntr->getOperand(0);
900 auto PredOpTy = cast<VectorType>(PredOp->getType());
901 if (PredOpTy != II.getType())
902 return std::nullopt;
903
904 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
905 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
906 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
907 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
908 if (BinOpOp1 == BinOpOp2)
909 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
910 else
911 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
912 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
913
914 auto NarrowedBinOp =
915 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
916 return IC.replaceInstUsesWith(II, NarrowedBinOp);
917}
918
919static std::optional<Instruction *>
920instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
921 // If the reinterpret instruction operand is a PHI Node
922 if (isa<PHINode>(II.getArgOperand(0)))
923 return processPhiNode(IC, II);
924
925 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
926 return BinOpCombine;
927
928 // Ignore converts to/from svcount_t.
929 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
930 isa<TargetExtType>(II.getType()))
931 return std::nullopt;
932
933 SmallVector<Instruction *, 32> CandidatesForRemoval;
934 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
935
936 const auto *IVTy = cast<VectorType>(II.getType());
937
938 // Walk the chain of conversions.
939 while (Cursor) {
940 // If the type of the cursor has fewer lanes than the final result, zeroing
941 // must take place, which breaks the equivalence chain.
942 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
943 if (CursorVTy->getElementCount().getKnownMinValue() <
944 IVTy->getElementCount().getKnownMinValue())
945 break;
946
947 // If the cursor has the same type as I, it is a viable replacement.
948 if (Cursor->getType() == IVTy)
949 EarliestReplacement = Cursor;
950
951 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
952
953 // If this is not an SVE conversion intrinsic, this is the end of the chain.
954 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
955 Intrinsic::aarch64_sve_convert_to_svbool ||
956 IntrinsicCursor->getIntrinsicID() ==
957 Intrinsic::aarch64_sve_convert_from_svbool))
958 break;
959
960 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
961 Cursor = IntrinsicCursor->getOperand(0);
962 }
963
964 // If no viable replacement in the conversion chain was found, there is
965 // nothing to do.
966 if (!EarliestReplacement)
967 return std::nullopt;
968
969 return IC.replaceInstUsesWith(II, EarliestReplacement);
970}
971
972static bool isAllActivePredicate(Value *Pred) {
973 // Look through convert.from.svbool(convert.to.svbool(...) chain.
974 Value *UncastedPred;
975 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
976 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
977 m_Value(UncastedPred)))))
978 // If the predicate has the same or less lanes than the uncasted
979 // predicate then we know the casting has no effect.
980 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
981 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
982 Pred = UncastedPred;
983
984 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
985 m_ConstantInt<AArch64SVEPredPattern::all>()));
986}
987
988static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
989 IntrinsicInst &II) {
990 // svsel(ptrue, x, y) => x
991 auto *OpPredicate = II.getOperand(0);
992 if (isAllActivePredicate(OpPredicate))
993 return IC.replaceInstUsesWith(II, II.getOperand(1));
994
995 auto Select =
996 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
997 return IC.replaceInstUsesWith(II, Select);
998}
999
1000static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1001 IntrinsicInst &II) {
1002 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1003 if (!Pg)
1004 return std::nullopt;
1005
1006 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1007 return std::nullopt;
1008
1009 const auto PTruePattern =
1010 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1011 if (PTruePattern != AArch64SVEPredPattern::vl1)
1012 return std::nullopt;
1013
1014 // The intrinsic is inserting into lane zero so use an insert instead.
1015 auto *IdxTy = Type::getInt64Ty(II.getContext());
1016 auto *Insert = InsertElementInst::Create(
1017 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1018 Insert->insertBefore(&II);
1019 Insert->takeName(&II);
1020
1021 return IC.replaceInstUsesWith(II, Insert);
1022}
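// Illustrative example (not from the original source): for
//   sve.dup(%passthru, ptrue(vl1), %scalar)
// only lane 0 is active, so the call is replaced with
//   insertelement %passthru, %scalar, i64 0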
1023
1024static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1025 IntrinsicInst &II) {
1026 // Replace DupX with a regular IR splat.
1027 auto *RetTy = cast<ScalableVectorType>(II.getType());
1028 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1029 II.getArgOperand(0));
1030 Splat->takeName(&II);
1031 return IC.replaceInstUsesWith(II, Splat);
1032}
1033
1034static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1035 IntrinsicInst &II) {
1036 LLVMContext &Ctx = II.getContext();
1037
1038 // Check that the predicate is all active
1039 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1040 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1041 return std::nullopt;
1042
1043 const auto PTruePattern =
1044 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1045 if (PTruePattern != AArch64SVEPredPattern::all)
1046 return std::nullopt;
1047
1048 // Check that we have a compare of zero..
1049 auto *SplatValue =
1050 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1051 if (!SplatValue || !SplatValue->isZero())
1052 return std::nullopt;
1053
1054 // ..against a dupq
1055 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1056 if (!DupQLane ||
1057 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1058 return std::nullopt;
1059
1060 // Where the dupq is a lane 0 replicate of a vector insert
1061 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1062 return std::nullopt;
1063
1064 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1065 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1066 return std::nullopt;
1067
1068 // Where the vector insert is a fixed constant vector insert into undef at
1069 // index zero
1070 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1071 return std::nullopt;
1072
1073 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1074 return std::nullopt;
1075
1076 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1077 if (!ConstVec)
1078 return std::nullopt;
1079
1080 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1081 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1082 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1083 return std::nullopt;
1084
1085 unsigned NumElts = VecTy->getNumElements();
1086 unsigned PredicateBits = 0;
1087
1088 // Expand intrinsic operands to a 16-bit byte level predicate
1089 for (unsigned I = 0; I < NumElts; ++I) {
1090 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1091 if (!Arg)
1092 return std::nullopt;
1093 if (!Arg->isZero())
1094 PredicateBits |= 1 << (I * (16 / NumElts));
1095 }
1096
1097 // If all bits are zero bail early with an empty predicate
1098 if (PredicateBits == 0) {
1099 auto *PFalse = Constant::getNullValue(II.getType());
1100 PFalse->takeName(&II);
1101 return IC.replaceInstUsesWith(II, PFalse);
1102 }
1103
1104 // Calculate largest predicate type used (where byte predicate is largest)
1105 unsigned Mask = 8;
1106 for (unsigned I = 0; I < 16; ++I)
1107 if ((PredicateBits & (1 << I)) != 0)
1108 Mask |= (I % 8);
1109
1110 unsigned PredSize = Mask & -Mask;
1111 auto *PredType = ScalableVectorType::get(
1112 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1113
1114 // Ensure all relevant bits are set
1115 for (unsigned I = 0; I < 16; I += PredSize)
1116 if ((PredicateBits & (1 << I)) == 0)
1117 return std::nullopt;
1118
1119 auto *PTruePat =
1120 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1121 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1122 {PredType}, {PTruePat});
1123 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1124 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1125 auto *ConvertFromSVBool =
1126 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1127 {II.getType()}, {ConvertToSVBool});
1128
1129 ConvertFromSVBool->takeName(&II);
1130 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1131}
1132
1133static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1134 IntrinsicInst &II) {
1135 Value *Pg = II.getArgOperand(0);
1136 Value *Vec = II.getArgOperand(1);
1137 auto IntrinsicID = II.getIntrinsicID();
1138 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1139
1140 // lastX(splat(X)) --> X
1141 if (auto *SplatVal = getSplatValue(Vec))
1142 return IC.replaceInstUsesWith(II, SplatVal);
1143
1144 // If x and/or y is a splat value then:
1145 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1146 Value *LHS, *RHS;
1147 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1148 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1149 auto *OldBinOp = cast<BinaryOperator>(Vec);
1150 auto OpC = OldBinOp->getOpcode();
1151 auto *NewLHS =
1152 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1153 auto *NewRHS =
1154 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1155 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1156 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1157 return IC.replaceInstUsesWith(II, NewBinOp);
1158 }
1159 }
1160
1161 auto *C = dyn_cast<Constant>(Pg);
1162 if (IsAfter && C && C->isNullValue()) {
1163 // The intrinsic is extracting lane 0 so use an extract instead.
1164 auto *IdxTy = Type::getInt64Ty(II.getContext());
1165 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1166 Extract->insertBefore(&II);
1167 Extract->takeName(&II);
1168 return IC.replaceInstUsesWith(II, Extract);
1169 }
1170
1171 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1172 if (!IntrPG)
1173 return std::nullopt;
1174
1175 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1176 return std::nullopt;
1177
1178 const auto PTruePattern =
1179 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1180
1181 // Can the intrinsic's predicate be converted to a known constant index?
1182 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1183 if (!MinNumElts)
1184 return std::nullopt;
1185
1186 unsigned Idx = MinNumElts - 1;
1187 // Increment the index if extracting the element after the last active
1188 // predicate element.
1189 if (IsAfter)
1190 ++Idx;
1191
1192 // Ignore extracts whose index is larger than the known minimum vector
1193 // length. NOTE: This is an artificial constraint where we prefer to
1194 // maintain what the user asked for until an alternative is proven faster.
1195 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1196 if (Idx >= PgVTy->getMinNumElements())
1197 return std::nullopt;
1198
1199 // The intrinsic is extracting a fixed lane so use an extract instead.
1200 auto *IdxTy = Type::getInt64Ty(II.getContext());
1201 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1202 Extract->insertBefore(&II);
1203 Extract->takeName(&II);
1204 return IC.replaceInstUsesWith(II, Extract);
1205}
1206
1207static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1208 IntrinsicInst &II) {
1209 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1210 // integer variant across a variety of micro-architectures. Replace scalar
1211 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1212 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1213 // depending on the micro-architecture, but has been observed as generally
1214 // being faster, particularly when the CLAST[AB] op is a loop-carried
1215 // dependency.
1216 Value *Pg = II.getArgOperand(0);
1217 Value *Fallback = II.getArgOperand(1);
1218 Value *Vec = II.getArgOperand(2);
1219 Type *Ty = II.getType();
1220
1221 if (!Ty->isIntegerTy())
1222 return std::nullopt;
1223
1224 Type *FPTy;
1225 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1226 default:
1227 return std::nullopt;
1228 case 16:
1229 FPTy = IC.Builder.getHalfTy();
1230 break;
1231 case 32:
1232 FPTy = IC.Builder.getFloatTy();
1233 break;
1234 case 64:
1235 FPTy = IC.Builder.getDoubleTy();
1236 break;
1237 }
1238
1239 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1240 auto *FPVTy = VectorType::get(
1241 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1242 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1243 auto *FPII = IC.Builder.CreateIntrinsic(
1244 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1245 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1246 return IC.replaceInstUsesWith(II, FPIItoInt);
1247}
1248
1249static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1250 IntrinsicInst &II) {
1251 LLVMContext &Ctx = II.getContext();
1252 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1253 // can work with RDFFR_PP for ptest elimination.
1254 auto *AllPat =
1255 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1256 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1257 {II.getType()}, {AllPat});
1258 auto *RDFFR =
1259 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1260 RDFFR->takeName(&II);
1261 return IC.replaceInstUsesWith(II, RDFFR);
1262}
1263
1264static std::optional<Instruction *>
1265instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1266 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1267
1268 if (Pattern == AArch64SVEPredPattern::all) {
1269 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1270 auto *VScale = IC.Builder.CreateVScale(StepVal);
1271 VScale->takeName(&II);
1272 return IC.replaceInstUsesWith(II, VScale);
1273 }
1274
1275 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1276
1277 return MinNumElts && NumElts >= MinNumElts
1278 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1279 II, ConstantInt::get(II.getType(), MinNumElts)))
1280 : std::nullopt;
1281}
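// Illustrative example (assuming NumElts == 4 for a cntw-style intrinsic, not
// from the original source): cntw(all) folds to vscale * 4, and cntw(vl4)
// folds to the constant 4 because even the minimum-sized vector has at least
// four 32-bit elements.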
1282
1283static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1284 IntrinsicInst &II) {
1285 Value *PgVal = II.getArgOperand(0);
1286 Value *OpVal = II.getArgOperand(1);
1287
1288 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1289 // Later optimizations prefer this form.
1290 if (PgVal == OpVal &&
1291 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1292 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1293 Value *Ops[] = {PgVal, OpVal};
1294 Type *Tys[] = {PgVal->getType()};
1295
1296 auto *PTest =
1297 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1298 PTest->takeName(&II);
1299
1300 return IC.replaceInstUsesWith(II, PTest);
1301 }
1302
1303 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1304 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1305
1306 if (!Pg || !Op)
1307 return std::nullopt;
1308
1309 Intrinsic::ID OpIID = Op->getIntrinsicID();
1310
1311 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1312 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1313 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1314 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1315 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1316
1317 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1318
1319 PTest->takeName(&II);
1320 return IC.replaceInstUsesWith(II, PTest);
1321 }
1322
1323 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1324 // Later optimizations may rewrite sequence to use the flag-setting variant
1325 // of instruction X to remove PTEST.
1326 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1327 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1328 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1329 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1330 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1331 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1332 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1333 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1334 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1335 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1336 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1337 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1338 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1339 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1340 Type *Tys[] = {Pg->getType()};
1341
1342 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1343 PTest->takeName(&II);
1344
1345 return IC.replaceInstUsesWith(II, PTest);
1346 }
1347
1348 return std::nullopt;
1349}
1350
1351template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1352static std::optional<Instruction *>
1353instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1354 bool MergeIntoAddendOp) {
1355 Value *P = II.getOperand(0);
1356 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1357 if (MergeIntoAddendOp) {
1358 AddendOp = II.getOperand(1);
1359 Mul = II.getOperand(2);
1360 } else {
1361 AddendOp = II.getOperand(2);
1362 Mul = II.getOperand(1);
1363 }
1364
1365 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1366 m_Value(MulOp1))))
1367 return std::nullopt;
1368
1369 if (!Mul->hasOneUse())
1370 return std::nullopt;
1371
1372 Instruction *FMFSource = nullptr;
1373 if (II.getType()->isFPOrFPVectorTy()) {
1374 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1375 // Stop the combine when the flags on the inputs differ in case dropping
1376 // flags would lead to us missing out on more beneficial optimizations.
1377 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1378 return std::nullopt;
1379 if (!FAddFlags.allowContract())
1380 return std::nullopt;
1381 FMFSource = &II;
1382 }
1383
1384 CallInst *Res;
1385 if (MergeIntoAddendOp)
1386 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1387 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1388 else
1389 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1390 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1391
1392 return IC.replaceInstUsesWith(II, Res);
1393}
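// Illustrative example (not from the original source): instantiated with
// MulOpc = fmul and FuseOpc = fmla, a single-use, contract-enabled pattern
//   fadd(%pg, %a, fmul(%pg, %b, %c))
// is rewritten to fmla(%pg, %a, %b, %c), provided both calls use the same
// predicate and compatible fast-math flags.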
1394
1395static std::optional<Instruction *>
1396instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1397 Value *Pred = II.getOperand(0);
1398 Value *PtrOp = II.getOperand(1);
1399 Type *VecTy = II.getType();
1400
1401 if (isAllActivePredicate(Pred)) {
1402 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1403 Load->copyMetadata(II);
1404 return IC.replaceInstUsesWith(II, Load);
1405 }
1406
1407 CallInst *MaskedLoad =
1408 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1409 Pred, ConstantAggregateZero::get(VecTy));
1410 MaskedLoad->copyMetadata(II);
1411 return IC.replaceInstUsesWith(II, MaskedLoad);
1412}
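// Illustrative example (not from the original source): an sve.ld1 whose
// governing predicate is ptrue(all) becomes a plain vector load, while any
// other predicate becomes an equivalent llvm.masked.load with a
// zeroinitializer passthru.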
1413
1414static std::optional<Instruction *>
1415instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1416 Value *VecOp = II.getOperand(0);
1417 Value *Pred = II.getOperand(1);
1418 Value *PtrOp = II.getOperand(2);
1419
1420 if (isAllActivePredicate(Pred)) {
1421 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1422 Store->copyMetadata(II);
1423 return IC.eraseInstFromFunction(II);
1424 }
1425
1426 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1427 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1428 MaskedStore->copyMetadata(II);
1429 return IC.eraseInstFromFunction(II);
1430}
1431
1432static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1433 switch (Intrinsic) {
1434 case Intrinsic::aarch64_sve_fmul_u:
1435 return Instruction::BinaryOps::FMul;
1436 case Intrinsic::aarch64_sve_fadd_u:
1437 return Instruction::BinaryOps::FAdd;
1438 case Intrinsic::aarch64_sve_fsub_u:
1439 return Instruction::BinaryOps::FSub;
1440 default:
1441 return Instruction::BinaryOpsEnd;
1442 }
1443}
1444
1445static std::optional<Instruction *>
1446instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1447 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1448 if (II.isStrictFP())
1449 return std::nullopt;
1450
1451 auto *OpPredicate = II.getOperand(0);
1452 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1453 if (BinOpCode == Instruction::BinaryOpsEnd ||
1454 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1455 m_ConstantInt<AArch64SVEPredPattern::all>())))
1456 return std::nullopt;
1457 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1458 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1459 auto BinOp =
1460 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1461 return IC.replaceInstUsesWith(II, BinOp);
1462}
1463
1464// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1465// sve.add_u).
1466static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1467 Intrinsic::ID IID) {
1468 auto *OpPredicate = II.getOperand(0);
1469 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1470 m_ConstantInt<AArch64SVEPredPattern::all>())))
1471 return std::nullopt;
1472
1473 auto *Mod = II.getModule();
1474 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1475 II.setCalledFunction(NewDecl);
1476
1477 return &II;
1478}
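// Illustrative example (not from the original source): sve.add(ptrue(all),
// %a, %b) is retargeted in place to sve.add_u, whose inactive lanes are
// undefined; this enables further combines that need not preserve a
// passthru operand.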
1479
1480// Simplify operations where predicate has all inactive lanes or try to replace
1481// with _u form when all lanes are active
1482static std::optional<Instruction *>
1483instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1484 Intrinsic::ID IID) {
1485 if (match(II.getOperand(0), m_ZeroInt())) {
1486 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1487 // inactive for sv[func]_m
1488 return IC.replaceInstUsesWith(II, II.getOperand(1));
1489 }
1490 return instCombineSVEAllActive(II, IID);
1491}
1492
1493static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1494 IntrinsicInst &II) {
1495 if (auto II_U =
1496 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1497 return II_U;
1498 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1499 Intrinsic::aarch64_sve_mla>(
1500 IC, II, true))
1501 return MLA;
1502 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1503 Intrinsic::aarch64_sve_mad>(
1504 IC, II, false))
1505 return MAD;
1506 return std::nullopt;
1507}
1508
1509static std::optional<Instruction *>
1510instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1511 if (auto II_U =
1512 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1513 return II_U;
1514 if (auto FMLA =
1515 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1516 Intrinsic::aarch64_sve_fmla>(IC, II,
1517 true))
1518 return FMLA;
1519 if (auto FMAD =
1520 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1521 Intrinsic::aarch64_sve_fmad>(IC, II,
1522 false))
1523 return FMAD;
1524 if (auto FMLA =
1525 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1526 Intrinsic::aarch64_sve_fmla>(IC, II,
1527 true))
1528 return FMLA;
1529 return std::nullopt;
1530}
1531
1532static std::optional<Instruction *>
1533instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1534 if (auto FMLA =
1535 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1536 Intrinsic::aarch64_sve_fmla>(IC, II,
1537 true))
1538 return FMLA;
1539 if (auto FMAD =
1540 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1541 Intrinsic::aarch64_sve_fmad>(IC, II,
1542 false))
1543 return FMAD;
1544 if (auto FMLA_U =
1545 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1546 Intrinsic::aarch64_sve_fmla_u>(
1547 IC, II, true))
1548 return FMLA_U;
1549 return instCombineSVEVectorBinOp(IC, II);
1550}
1551
1552static std::optional<Instruction *>
1553instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1554 if (auto II_U =
1555 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1556 return II_U;
1557 if (auto FMLS =
1558 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1559 Intrinsic::aarch64_sve_fmls>(IC, II,
1560 true))
1561 return FMLS;
1562 if (auto FMSB =
1563 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1564 Intrinsic::aarch64_sve_fnmsb>(
1565 IC, II, false))
1566 return FMSB;
1567 if (auto FMLS =
1568 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1569 Intrinsic::aarch64_sve_fmls>(IC, II,
1570 true))
1571 return FMLS;
1572 return std::nullopt;
1573}
1574
1575static std::optional<Instruction *>
1576instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1577 if (auto FMLS =
1578 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1579 Intrinsic::aarch64_sve_fmls>(IC, II,
1580 true))
1581 return FMLS;
1582 if (auto FMSB =
1583 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1584 Intrinsic::aarch64_sve_fnmsb>(
1585 IC, II, false))
1586 return FMSB;
1587 if (auto FMLS_U =
1588 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1589 Intrinsic::aarch64_sve_fmls_u>(
1590 IC, II, true))
1591 return FMLS_U;
1592 return instCombineSVEVectorBinOp(IC, II);
1593}
1594
1595static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1596 IntrinsicInst &II) {
1597 if (auto II_U =
1598 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1599 return II_U;
1600 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1601 Intrinsic::aarch64_sve_mls>(
1602 IC, II, true))
1603 return MLS;
1604 return std::nullopt;
1605}
1606
1607static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1608 IntrinsicInst &II,
1609 Intrinsic::ID IID) {
1610 auto *OpPredicate = II.getOperand(0);
1611 auto *OpMultiplicand = II.getOperand(1);
1612 auto *OpMultiplier = II.getOperand(2);
1613
1614 // Return true if a given instruction is a unit splat value, false otherwise.
1615 auto IsUnitSplat = [](auto *I) {
1616 auto *SplatValue = getSplatValue(I);
1617 if (!SplatValue)
1618 return false;
1619 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1620 };
1621
1622 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1623 // with a unit splat value, false otherwise.
1624 auto IsUnitDup = [](auto *I) {
1625 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1626 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1627 return false;
1628
1629 auto *SplatValue = IntrI->getOperand(2);
1630 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1631 };
1632
1633 if (IsUnitSplat(OpMultiplier)) {
1634 // [f]mul pg %n, (dupx 1) => %n
1635 OpMultiplicand->takeName(&II);
1636 return IC.replaceInstUsesWith(II, OpMultiplicand);
1637 } else if (IsUnitDup(OpMultiplier)) {
1638 // [f]mul pg %n, (dup pg 1) => %n
1639 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1640 auto *DupPg = DupInst->getOperand(1);
1641 // TODO: this is naive. The optimization is still valid if DupPg
1642 // 'encompasses' OpPredicate, not only if they're the same predicate.
1643 if (OpPredicate == DupPg) {
1644 OpMultiplicand->takeName(&II);
1645 return IC.replaceInstUsesWith(II, OpMultiplicand);
1646 }
1647 }
1648
1649 return instCombineSVEVectorBinOp(IC, II);
1650}
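// Illustration (schematic IR, not from this file): the unit-splat folds above
// rewrite fmul pg %x, (dupx 1.0) and mul pg %x, (dupx 1) to plain %x, since a
// multiply by one leaves the active lanes unchanged and the inactive lanes of
// the merging form already take their value from %x.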
1651
1652static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1653 IntrinsicInst &II) {
1654 Value *UnpackArg = II.getArgOperand(0);
1655 auto *RetTy = cast<ScalableVectorType>(II.getType());
1656 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1657 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1658
1659 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1660 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1661 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1662 ScalarArg =
1663 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1664 Value *NewVal =
1665 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1666 NewVal->takeName(&II);
1667 return IC.replaceInstUsesWith(II, NewVal);
1668 }
1669
1670 return std::nullopt;
1671}
1672static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1673 IntrinsicInst &II) {
1674 auto *OpVal = II.getOperand(0);
1675 auto *OpIndices = II.getOperand(1);
1676 VectorType *VTy = cast<VectorType>(II.getType());
1677
1678 // Check whether OpIndices is a constant splat value < minimal element count
1679 // of result.
1680 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1681 if (!SplatValue ||
1682 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1683 return std::nullopt;
1684
1685 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1686 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1687 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1688 auto *VectorSplat =
1689 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1690
1691 VectorSplat->takeName(&II);
1692 return IC.replaceInstUsesWith(II, VectorSplat);
1693}
1694
1695static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1696 IntrinsicInst &II) {
1697 Value *A, *B;
1698 Type *RetTy = II.getType();
1699 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1700 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1701
1702 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1703 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1704 if ((match(II.getArgOperand(0),
1705 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1706 match(II.getArgOperand(1),
1707 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1708 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1709 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1710 auto *TyA = cast<ScalableVectorType>(A->getType());
1711 if (TyA == B->getType() &&
1712 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1713 auto *SubVec = IC.Builder.CreateInsertVector(
1714 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1715 auto *ConcatVec = IC.Builder.CreateInsertVector(
1716 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1717 ConcatVec->takeName(&II);
1718 return IC.replaceInstUsesWith(II, ConcatVec);
1719 }
1720 }
1721
1722 return std::nullopt;
1723}
1724
1725static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1726 IntrinsicInst &II) {
1727 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1728 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1729 Value *A, *B;
1730 if (match(II.getArgOperand(0),
1731 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1732 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1733 m_Specific(A), m_Specific(B))))
1734 return IC.replaceInstUsesWith(
1735 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1736
1737 return std::nullopt;
1738}
1739
1740static std::optional<Instruction *>
1741 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1742 Value *Mask = II.getOperand(0);
1743 Value *BasePtr = II.getOperand(1);
1744 Value *Index = II.getOperand(2);
1745 Type *Ty = II.getType();
1746 Value *PassThru = ConstantAggregateZero::get(Ty);
1747
1748 // Contiguous gather => masked load.
1749 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1750 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1751 Value *IndexBase;
1752 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1753 m_Value(IndexBase), m_SpecificInt(1)))) {
1754 Align Alignment =
1755 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1756
1757 Type *VecPtrTy = PointerType::getUnqual(Ty);
1758 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1759 BasePtr, IndexBase);
1760 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1761 CallInst *MaskedLoad =
1762 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1763 MaskedLoad->takeName(&II);
1764 return IC.replaceInstUsesWith(II, MaskedLoad);
1765 }
1766
1767 return std::nullopt;
1768}
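// Sketch of the rewrite above (assumed operand shapes, not from this file):
//   (sve.ld1.gather.index Mask Ptr (sve.index Base 1))
// becomes
//   (masked.load (gep Ptr Base) Align Mask zeroinitializer)
// so a stride-one gather is exposed to the generic masked-load handling.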
1769
1770static std::optional<Instruction *>
1771 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1772 Value *Val = II.getOperand(0);
1773 Value *Mask = II.getOperand(1);
1774 Value *BasePtr = II.getOperand(2);
1775 Value *Index = II.getOperand(3);
1776 Type *Ty = Val->getType();
1777
1778 // Contiguous scatter => masked store.
1779 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1780 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1781 Value *IndexBase;
1782 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1783 m_Value(IndexBase), m_SpecificInt(1)))) {
1784 Align Alignment =
1785 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1786
1787 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1788 BasePtr, IndexBase);
1789 Type *VecPtrTy = PointerType::getUnqual(Ty);
1790 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1791
1792 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1793
1794 return IC.eraseInstFromFunction(II);
1795 }
1796
1797 return std::nullopt;
1798}
1799
1800static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1801 IntrinsicInst &II) {
1802 Type *Int32Ty = IC.Builder.getInt32Ty();
1803 Value *Pred = II.getOperand(0);
1804 Value *Vec = II.getOperand(1);
1805 Value *DivVec = II.getOperand(2);
1806
1807 Value *SplatValue = getSplatValue(DivVec);
1808 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1809 if (!SplatConstantInt)
1810 return std::nullopt;
1811 APInt Divisor = SplatConstantInt->getValue();
1812
1813 if (Divisor.isPowerOf2()) {
1814 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1815 auto ASRD = IC.Builder.CreateIntrinsic(
1816 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1817 return IC.replaceInstUsesWith(II, ASRD);
1818 }
1819 if (Divisor.isNegatedPowerOf2()) {
1820 Divisor.negate();
1821 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1822 auto ASRD = IC.Builder.CreateIntrinsic(
1823 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1824 auto NEG = IC.Builder.CreateIntrinsic(
1825 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1826 return IC.replaceInstUsesWith(II, NEG);
1827 }
1828
1829 return std::nullopt;
1830}
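// Worked example (illustrative only): for a splat divisor of 8, logBase2 is 3,
// so (sve.sdiv Pred Vec (dupx 8)) becomes (sve.asrd Pred Vec 3); for a splat
// divisor of -8 the same asrd is emitted and then negated under Pred, i.e.
// (sve.neg ASRD Pred ASRD).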
1831
1832bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1833 size_t VecSize = Vec.size();
1834 if (VecSize == 1)
1835 return true;
1836 if (!isPowerOf2_64(VecSize))
1837 return false;
1838 size_t HalfVecSize = VecSize / 2;
1839
1840 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1841 RHS != Vec.end(); LHS++, RHS++) {
1842 if (*LHS != nullptr && *RHS != nullptr) {
1843 if (*LHS == *RHS)
1844 continue;
1845 else
1846 return false;
1847 }
1848 if (!AllowPoison)
1849 return false;
1850 if (*LHS == nullptr && *RHS != nullptr)
1851 *LHS = *RHS;
1852 }
1853
1854 Vec.resize(HalfVecSize);
1855 SimplifyValuePattern(Vec, AllowPoison);
1856 return true;
1857}
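// Example of the halving step above (hypothetical values): (A, B, A, B)
// compares its two halves, they match, and the vector is resized to (A, B);
// with AllowPoison, (A, nullptr, A, B) also simplifies because the missing
// element is filled in from its counterpart before the resize.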
1858
1859// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1860// to dupqlane(f64(C)) where C is A concatenated with B
1861static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1862 IntrinsicInst &II) {
1863 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1864 if (!match(II.getOperand(0),
1865 m_Intrinsic<Intrinsic::vector_insert>(
1866 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1867 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1868 return std::nullopt;
1869 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1870
1871 // Insert the scalars into a container ordered by InsertElement index
1872 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1873 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1874 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1875 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1876 CurrentInsertElt = InsertElt->getOperand(0);
1877 }
1878
1879 bool AllowPoison =
1880 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1881 if (!SimplifyValuePattern(Elts, AllowPoison))
1882 return std::nullopt;
1883
1884 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1885 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1886 for (size_t I = 0; I < Elts.size(); I++) {
1887 if (Elts[I] == nullptr)
1888 continue;
1889 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
1890 IC.Builder.getInt64(I));
1891 }
1892 if (InsertEltChain == nullptr)
1893 return std::nullopt;
1894
1895 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1896 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1897 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1898 // be narrowed back to the original type.
1899 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1900 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1901 IIScalableTy->getMinNumElements() /
1902 PatternWidth;
1903
1904 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1905 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1906 auto *WideShuffleMaskTy =
1907 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1908
1909 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
1910 auto InsertSubvector = IC.Builder.CreateInsertVector(
1911 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1912 auto WideBitcast =
1913 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1914 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1915 auto WideShuffle = IC.Builder.CreateShuffleVector(
1916 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1917 auto NarrowBitcast =
1918 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1919
1920 return IC.replaceInstUsesWith(II, NarrowBitcast);
1921}
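// Illustrative sizing (assumed types, not from this file): for a
// <vscale x 8 x half> dupqlane whose lanes simplify to (a, b), PatternWidth is
// 2 * 16 = 32 bits and PatternElementCount is 16 * 8 / 32 = 4, so the pair is
// bitcast to <vscale x 4 x i32>, splatted via a zero shuffle mask, and bitcast
// back to the original type.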
1922
1923static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1924 IntrinsicInst &II) {
1925 Value *A = II.getArgOperand(0);
1926 Value *B = II.getArgOperand(1);
1927 if (A == B)
1928 return IC.replaceInstUsesWith(II, A);
1929
1930 return std::nullopt;
1931}
1932
1933static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1934 IntrinsicInst &II) {
1935 Value *Pred = II.getOperand(0);
1936 Value *Vec = II.getOperand(1);
1937 Value *Shift = II.getOperand(2);
1938
1939 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1940 Value *AbsPred, *MergedValue;
1941 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1942 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1943 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1944 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1945
1946 return std::nullopt;
1947
1948 // Transform is valid if any of the following are true:
1949 // * The ABS merge value is an undef or non-negative
1950 // * The ABS predicate is all active
1951 // * The ABS predicate and the SRSHL predicates are the same
1952 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1953 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1954 return std::nullopt;
1955
1956 // Only valid when the shift amount is non-negative, otherwise the rounding
1957 // behaviour of SRSHL cannot be ignored.
1958 if (!match(Shift, m_NonNegative()))
1959 return std::nullopt;
1960
1961 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1962 {II.getType()}, {Pred, Vec, Shift});
1963
1964 return IC.replaceInstUsesWith(II, LSL);
1965}
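// For example (hypothetical operands): with %a = (sve.abs undef %pg %x) and a
// known non-negative splat shift such as (dupx 2), (sve.srshl %pg %a (dupx 2))
// is rewritten to (sve.lsl %pg %a (dupx 2)).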
1966
1967std::optional<Instruction *>
1968 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1969 IntrinsicInst &II) const {
1970 Intrinsic::ID IID = II.getIntrinsicID();
1971 switch (IID) {
1972 default:
1973 break;
1974 case Intrinsic::aarch64_neon_fmaxnm:
1975 case Intrinsic::aarch64_neon_fminnm:
1976 return instCombineMaxMinNM(IC, II);
1977 case Intrinsic::aarch64_sve_convert_from_svbool:
1978 return instCombineConvertFromSVBool(IC, II);
1979 case Intrinsic::aarch64_sve_dup:
1980 return instCombineSVEDup(IC, II);
1981 case Intrinsic::aarch64_sve_dup_x:
1982 return instCombineSVEDupX(IC, II);
1983 case Intrinsic::aarch64_sve_cmpne:
1984 case Intrinsic::aarch64_sve_cmpne_wide:
1985 return instCombineSVECmpNE(IC, II);
1986 case Intrinsic::aarch64_sve_rdffr:
1987 return instCombineRDFFR(IC, II);
1988 case Intrinsic::aarch64_sve_lasta:
1989 case Intrinsic::aarch64_sve_lastb:
1990 return instCombineSVELast(IC, II);
1991 case Intrinsic::aarch64_sve_clasta_n:
1992 case Intrinsic::aarch64_sve_clastb_n:
1993 return instCombineSVECondLast(IC, II);
1994 case Intrinsic::aarch64_sve_cntd:
1995 return instCombineSVECntElts(IC, II, 2);
1996 case Intrinsic::aarch64_sve_cntw:
1997 return instCombineSVECntElts(IC, II, 4);
1998 case Intrinsic::aarch64_sve_cnth:
1999 return instCombineSVECntElts(IC, II, 8);
2000 case Intrinsic::aarch64_sve_cntb:
2001 return instCombineSVECntElts(IC, II, 16);
2002 case Intrinsic::aarch64_sve_ptest_any:
2003 case Intrinsic::aarch64_sve_ptest_first:
2004 case Intrinsic::aarch64_sve_ptest_last:
2005 return instCombineSVEPTest(IC, II);
2006 case Intrinsic::aarch64_sve_fabd:
2007 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2008 case Intrinsic::aarch64_sve_fadd:
2009 return instCombineSVEVectorFAdd(IC, II);
2010 case Intrinsic::aarch64_sve_fadd_u:
2011 return instCombineSVEVectorFAddU(IC, II);
2012 case Intrinsic::aarch64_sve_fdiv:
2013 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2014 case Intrinsic::aarch64_sve_fmax:
2015 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2016 case Intrinsic::aarch64_sve_fmaxnm:
2017 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2018 case Intrinsic::aarch64_sve_fmin:
2019 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2020 case Intrinsic::aarch64_sve_fminnm:
2021 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2022 case Intrinsic::aarch64_sve_fmla:
2023 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2024 case Intrinsic::aarch64_sve_fmls:
2025 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2026 case Intrinsic::aarch64_sve_fmul:
2027 if (auto II_U =
2028 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2029 return II_U;
2030 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2031 case Intrinsic::aarch64_sve_fmul_u:
2032 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2033 case Intrinsic::aarch64_sve_fmulx:
2034 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2035 case Intrinsic::aarch64_sve_fnmla:
2036 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2037 case Intrinsic::aarch64_sve_fnmls:
2038 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2039 case Intrinsic::aarch64_sve_fsub:
2040 return instCombineSVEVectorFSub(IC, II);
2041 case Intrinsic::aarch64_sve_fsub_u:
2042 return instCombineSVEVectorFSubU(IC, II);
2043 case Intrinsic::aarch64_sve_add:
2044 return instCombineSVEVectorAdd(IC, II);
2045 case Intrinsic::aarch64_sve_add_u:
2046 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2047 Intrinsic::aarch64_sve_mla_u>(
2048 IC, II, true);
2049 case Intrinsic::aarch64_sve_mla:
2050 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2051 case Intrinsic::aarch64_sve_mls:
2052 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2053 case Intrinsic::aarch64_sve_mul:
2054 if (auto II_U =
2055 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2056 return II_U;
2057 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2058 case Intrinsic::aarch64_sve_mul_u:
2059 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2060 case Intrinsic::aarch64_sve_sabd:
2061 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2062 case Intrinsic::aarch64_sve_smax:
2063 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2064 case Intrinsic::aarch64_sve_smin:
2065 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2066 case Intrinsic::aarch64_sve_smulh:
2067 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2068 case Intrinsic::aarch64_sve_sub:
2069 return instCombineSVEVectorSub(IC, II);
2070 case Intrinsic::aarch64_sve_sub_u:
2071 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2072 Intrinsic::aarch64_sve_mls_u>(
2073 IC, II, true);
2074 case Intrinsic::aarch64_sve_uabd:
2075 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2076 case Intrinsic::aarch64_sve_umax:
2077 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2078 case Intrinsic::aarch64_sve_umin:
2079 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2080 case Intrinsic::aarch64_sve_umulh:
2081 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2082 case Intrinsic::aarch64_sve_asr:
2083 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2084 case Intrinsic::aarch64_sve_lsl:
2085 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2086 case Intrinsic::aarch64_sve_lsr:
2087 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2088 case Intrinsic::aarch64_sve_and:
2089 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2090 case Intrinsic::aarch64_sve_bic:
2091 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2092 case Intrinsic::aarch64_sve_eor:
2093 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2094 case Intrinsic::aarch64_sve_orr:
2095 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2096 case Intrinsic::aarch64_sve_sqsub:
2097 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2098 case Intrinsic::aarch64_sve_uqsub:
2099 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2100 case Intrinsic::aarch64_sve_tbl:
2101 return instCombineSVETBL(IC, II);
2102 case Intrinsic::aarch64_sve_uunpkhi:
2103 case Intrinsic::aarch64_sve_uunpklo:
2104 case Intrinsic::aarch64_sve_sunpkhi:
2105 case Intrinsic::aarch64_sve_sunpklo:
2106 return instCombineSVEUnpack(IC, II);
2107 case Intrinsic::aarch64_sve_uzp1:
2108 return instCombineSVEUzp1(IC, II);
2109 case Intrinsic::aarch64_sve_zip1:
2110 case Intrinsic::aarch64_sve_zip2:
2111 return instCombineSVEZip(IC, II);
2112 case Intrinsic::aarch64_sve_ld1_gather_index:
2113 return instCombineLD1GatherIndex(IC, II);
2114 case Intrinsic::aarch64_sve_st1_scatter_index:
2115 return instCombineST1ScatterIndex(IC, II);
2116 case Intrinsic::aarch64_sve_ld1:
2117 return instCombineSVELD1(IC, II, DL);
2118 case Intrinsic::aarch64_sve_st1:
2119 return instCombineSVEST1(IC, II, DL);
2120 case Intrinsic::aarch64_sve_sdiv:
2121 return instCombineSVESDIV(IC, II);
2122 case Intrinsic::aarch64_sve_sel:
2123 return instCombineSVESel(IC, II);
2124 case Intrinsic::aarch64_sve_srshl:
2125 return instCombineSVESrshl(IC, II);
2126 case Intrinsic::aarch64_sve_dupq_lane:
2127 return instCombineSVEDupqLane(IC, II);
2128 }
2129
2130 return std::nullopt;
2131}
2132
2133 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2134 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2135 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2136 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2137 SimplifyAndSetOp) const {
2138 switch (II.getIntrinsicID()) {
2139 default:
2140 break;
2141 case Intrinsic::aarch64_neon_fcvtxn:
2142 case Intrinsic::aarch64_neon_rshrn:
2143 case Intrinsic::aarch64_neon_sqrshrn:
2144 case Intrinsic::aarch64_neon_sqrshrun:
2145 case Intrinsic::aarch64_neon_sqshrn:
2146 case Intrinsic::aarch64_neon_sqshrun:
2147 case Intrinsic::aarch64_neon_sqxtn:
2148 case Intrinsic::aarch64_neon_sqxtun:
2149 case Intrinsic::aarch64_neon_uqrshrn:
2150 case Intrinsic::aarch64_neon_uqshrn:
2151 case Intrinsic::aarch64_neon_uqxtn:
2152 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2153 break;
2154 }
2155
2156 return std::nullopt;
2157}
2158
2159 TypeSize
2160 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2161 switch (K) {
2162 case TargetTransformInfo::RGK_Scalar:
2163 return TypeSize::getFixed(64);
2164 case TargetTransformInfo::RGK_FixedWidthVector:
2165 if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
2166 return TypeSize::getFixed(0);
2167
2168 if (ST->hasSVE())
2169 return TypeSize::getFixed(
2170 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2171
2172 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
2173 case TargetTransformInfo::RGK_ScalableVector:
2174 if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
2175 return TypeSize::getScalable(0);
2176
2177 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
2178 }
2179 llvm_unreachable("Unsupported register kind");
2180}
2181
2182bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2183 ArrayRef<const Value *> Args,
2184 Type *SrcOverrideTy) {
2185 // A helper that returns a vector type from the given type. The number of
2186 // elements is taken from the destination type DstTy.
2187 auto toVectorTy = [&](Type *ArgTy) {
2188 return VectorType::get(ArgTy->getScalarType(),
2189 cast<VectorType>(DstTy)->getElementCount());
2190 };
2191
2192 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2193 // i32, i64]. SVE doesn't generally have the same set of instructions to
2194 // perform an extend with the add/sub/mul. There are SMULLB style
2195 // instructions, but they operate on top/bottom, requiring some sort of lane
2196 // interleaving to be used with zext/sext.
2197 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2198 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2199 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2200 return false;
2201
2202 // Determine if the operation has a widening variant. We consider both the
2203 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2204 // instructions.
2205 //
2206 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2207 // verify that their extending operands are eliminated during code
2208 // generation.
2209 Type *SrcTy = SrcOverrideTy;
2210 switch (Opcode) {
2211 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2212 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2213 // The second operand needs to be an extend
2214 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2215 if (!SrcTy)
2216 SrcTy =
2217 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2218 } else
2219 return false;
2220 break;
2221 case Instruction::Mul: { // SMULL(2), UMULL(2)
2222 // Both operands need to be extends of the same type.
2223 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2224 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2225 if (!SrcTy)
2226 SrcTy =
2227 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2228 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2229 // If one of the operands is a Zext and the other has enough zero bits to
2230 // be treated as unsigned, we can still generate a umull, meaning the zext
2231 // is free.
2232 KnownBits Known =
2233 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2234 if (Args[0]->getType()->getScalarSizeInBits() -
2235 Known.Zero.countLeadingOnes() >
2236 DstTy->getScalarSizeInBits() / 2)
2237 return false;
2238 if (!SrcTy)
2239 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2240 DstTy->getScalarSizeInBits() / 2));
2241 } else
2242 return false;
2243 break;
2244 }
2245 default:
2246 return false;
2247 }
2248
2249 // Legalize the destination type and ensure it can be used in a widening
2250 // operation.
2251 auto DstTyL = getTypeLegalizationCost(DstTy);
2252 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2253 return false;
2254
2255 // Legalize the source type and ensure it can be used in a widening
2256 // operation.
2257 assert(SrcTy && "Expected some SrcTy");
2258 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2259 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2260 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2261 return false;
2262
2263 // Get the total number of vector elements in the legalized types.
2264 InstructionCost NumDstEls =
2265 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2266 InstructionCost NumSrcEls =
2267 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2268
2269 // Return true if the legalized types have the same number of vector elements
2270 // and the destination element type size is twice that of the source type.
2271 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2272}
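// Sketch of what this accepts (assumed types): a mul whose operands are both
// zext <8 x i8> -> <8 x i16> legalizes to a v8i16 destination with a v8i8
// source; the legalized element counts match and 16 == 2 * 8, so the extends
// can be folded into a umull and treated as free by the caller.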
2273
2274// s/urhadd instructions implement the following pattern, making the
2275// extends free:
2276// %x = add ((zext i8 -> i16), 1)
2277// %y = (zext i8 -> i16)
2278// trunc i16 (lshr (add %x, %y), 1) -> i8
2279//
2280 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2281 Type *Src) {
2282 // The source should be a legal vector type.
2283 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2284 (Src->isScalableTy() && !ST->hasSVE2()))
2285 return false;
2286
2287 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2288 return false;
2289
2290 // Look for trunc/shl/add before trying to match the pattern.
2291 const Instruction *Add = ExtUser;
2292 auto *AddUser =
2293 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2294 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2295 Add = AddUser;
2296
2297 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2298 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2299 return false;
2300
2301 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2302 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2303 Src->getScalarSizeInBits() !=
2304 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2305 return false;
2306
2307 // Try to match the whole pattern. Ext could be either the first or second
2308 // m_ZExtOrSExt matched.
2309 Instruction *Ex1, *Ex2;
2310 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2311 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2312 return false;
2313
2314 // Ensure both extends are of the same type
2315 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2316 Ex1->getOpcode() == Ex2->getOpcode())
2317 return true;
2318
2319 return false;
2320}
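// A concrete instance of the pattern above (illustrative types):
//   %x = add (zext <16 x i8> %a to <16 x i16>), splat(1)
//   %y = zext <16 x i8> %b to <16 x i16>
//   %r = trunc (lshr (add %x, %y), splat(1)) to <16 x i8>
// is expected to select to a single urhadd, so the extends feeding it are
// treated as free.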
2321
2322 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2323 Type *Src,
2324 TTI::CastContextHint CCH,
2325 TTI::TargetCostKind CostKind,
2326 const Instruction *I) {
2327 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2328 assert(ISD && "Invalid opcode");
2329 // If the cast is observable, and it is used by a widening instruction (e.g.,
2330 // uaddl, saddw, etc.), it may be free.
2331 if (I && I->hasOneUser()) {
2332 auto *SingleUser = cast<Instruction>(*I->user_begin());
2333 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2334 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2335 // For adds only count the second operand as free if both operands are
2336 // extends but not the same operation. (i.e both operands are not free in
2337 // add(sext, zext)).
2338 if (SingleUser->getOpcode() == Instruction::Add) {
2339 if (I == SingleUser->getOperand(1) ||
2340 (isa<CastInst>(SingleUser->getOperand(1)) &&
2341 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2342 return 0;
2343 } else // Others are free so long as isWideningInstruction returned true.
2344 return 0;
2345 }
2346
2347 // The cast will be free for the s/urhadd instructions
2348 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2349 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2350 return 0;
2351 }
2352
2353 // TODO: Allow non-throughput costs that aren't binary.
2354 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2355 if (CostKind != TTI::TCK_RecipThroughput)
2356 return Cost == 0 ? 0 : 1;
2357 return Cost;
2358 };
2359
2360 EVT SrcTy = TLI->getValueType(DL, Src);
2361 EVT DstTy = TLI->getValueType(DL, Dst);
2362
2363 if (!SrcTy.isSimple() || !DstTy.isSimple())
2364 return AdjustCost(
2365 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2366
2367 static const TypeConversionCostTblEntry
2368 ConversionTbl[] = {
2369 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2370 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2371 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2372 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2373 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2374 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2375 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2376 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2377 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2378 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2379 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2380 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2381 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2382 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2383 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2384 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2385 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2386 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2387 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2388 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2389
2390 // Truncations on nxvmiN
2391 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2392 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2393 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2394 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2395 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2396 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2397 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2398 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2399 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2400 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2401 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2402 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2403 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2404 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2405 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2406 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2407
2408 // The number of shll instructions for the extension.
2409 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2410 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2411 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2412 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2413 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2414 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2415 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2416 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2417 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2418 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2419 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2420 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2421 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2422 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2423 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2424 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2425
2426 // LowerVectorINT_TO_FP:
2427 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2428 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2429 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2430 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2431 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2432 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2433
2434 // Complex: to v2f32
2435 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2436 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2437 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2438 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2439 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2440 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2441
2442 // Complex: to v4f32
2443 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2444 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2445 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2446 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2447
2448 // Complex: to v8f32
2449 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2450 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2451 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2452 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2453
2454 // Complex: to v16f32
2455 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2456 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2457
2458 // Complex: to v2f64
2459 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2460 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2461 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2462 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2463 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2464 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2465
2466 // Complex: to v4f64
2467 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2468 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2469
2470 // LowerVectorFP_TO_INT
2471 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2472 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2473 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2474 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2475 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2476 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2477
2478 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2479 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2480 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2481 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2482 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2483 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2484 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2485
2486 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2487 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2488 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2489 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2490 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2491
2492 // Complex, from nxv2f32.
2493 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2494 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2495 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2496 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2497 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2498 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2499 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2500 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2501
2502 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2503 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2504 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2505 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2506 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2507 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2508 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2509
2510 // Complex, from nxv2f64.
2511 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2512 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2513 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2514 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2515 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2516 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2517 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2518 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2519
2520 // Complex, from nxv4f32.
2521 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2522 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2523 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2524 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2525 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2526 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2527 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2528 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2529
2530 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2531 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2532 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2533 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2534 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2535
2536 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2537 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2538 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2539 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2540 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2541 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2542 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2543
2544 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2545 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2546 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2547 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2548 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2549
2550 // Complex, from nxv8f16.
2551 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2552 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2553 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2554 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2555 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2556 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2557 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2558 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2559
2560 // Complex, from nxv4f16.
2561 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2562 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2563 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2564 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2565 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2566 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2567 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2568 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2569
2570 // Complex, from nxv2f16.
2571 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2572 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2573 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2574 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2575 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2576 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2577 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2578 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2579
2580 // Truncate from nxvmf32 to nxvmf16.
2581 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2582 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2583 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2584
2585 // Truncate from nxvmf64 to nxvmf16.
2586 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2587 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2588 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2589
2590 // Truncate from nxvmf64 to nxvmf32.
2591 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2592 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2593 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2594
2595 // Extend from nxvmf16 to nxvmf32.
2596 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2597 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2598 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2599
2600 // Extend from nxvmf16 to nxvmf64.
2601 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2602 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2603 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2604
2605 // Extend from nxvmf32 to nxvmf64.
2606 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2607 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2608 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2609
2610 // Bitcasts from float to integer
2611 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2612 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2613 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2614
2615 // Bitcasts from integer to float
2616 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2617 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2618 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2619
2620 // Add cost for extending to illegal -too wide- scalable vectors.
2621 // zero/sign extend are implemented by multiple unpack operations,
2622 // where each operation has a cost of 1.
2623 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2624 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2625 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2626 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2627 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2628 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2629
2630 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2631 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2632 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2633 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2634 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2635 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2636 };
2637
2638 // We have to estimate the cost of a fixed-length operation performed on
2639 // SVE registers, scaled by the number of SVE registers required to
2640 // represent the fixed-length type.
2641 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2642 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2643 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2644 ST->useSVEForFixedLengthVectors(WiderTy)) {
2645 std::pair<InstructionCost, MVT> LT =
2646 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2647 unsigned NumElements = AArch64::SVEBitsPerBlock /
2648 LT.second.getVectorElementType().getSizeInBits();
2649 return AdjustCost(
2650 LT.first *
2651 getCastInstrCost(
2652 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2653 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2654 CostKind, I));
2655 }
2656
2657 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2658 DstTy.getSimpleVT(),
2659 SrcTy.getSimpleVT()))
2660 return AdjustCost(Entry->Cost);
2661
2662 static const TypeConversionCostTblEntry FP16Tbl[] = {
2663 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2664 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2665 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2666 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2667 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2668 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2669 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2670 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2671 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2672 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2673 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2674 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2675 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2676 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2677 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2678 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2679 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2680 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2681 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2682 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2683 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2684 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2685 };
2686
2687 if (ST->hasFullFP16())
2688 if (const auto *Entry = ConvertCostTableLookup(
2689 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2690 return AdjustCost(Entry->Cost);
2691
2692 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2693 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2694 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2695 TargetLowering::TypePromoteInteger &&
2696 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2697 TargetLowering::TypeSplitVector) {
2698 // The standard behaviour in the backend for these cases is to split the
2699 // extend up into two parts:
2700 // 1. Perform an extending load or masked load up to the legal type.
2701 // 2. Extend the loaded data to the final type.
2702 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2703 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2704 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2705 Opcode, LegalTy, Src, CCH, CostKind, I);
2706 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2707 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2708 return Part1 + Part2;
2709 }
2710
2711 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2712 // but we also want to include the TTI::CastContextHint::Masked case too.
2713 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2714 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2715 TLI->isTypeLegal(DstTy))
2716 CCH = TTI::CastContextHint::Normal;
2717
2718 return AdjustCost(
2719 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2720}
2721
2722 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2723 Type *Dst,
2724 VectorType *VecTy,
2725 unsigned Index) {
2726
2727 // Make sure we were given a valid extend opcode.
2728 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2729 "Invalid opcode");
2730
2731 // We are extending an element we extract from a vector, so the source type
2732 // of the extend is the element type of the vector.
2733 auto *Src = VecTy->getElementType();
2734
2735 // Sign- and zero-extends are for integer types only.
2736 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2737
2738 // Get the cost for the extract. We compute the cost (if any) for the extend
2739 // below.
2740 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2741 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2742 CostKind, Index, nullptr, nullptr);
2743
2744 // Legalize the types.
2745 auto VecLT = getTypeLegalizationCost(VecTy);
2746 auto DstVT = TLI->getValueType(DL, Dst);
2747 auto SrcVT = TLI->getValueType(DL, Src);
2748
2749 // If the resulting type is still a vector and the destination type is legal,
2750 // we may get the extension for free. If not, get the default cost for the
2751 // extend.
2752 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2753 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2754 CostKind);
2755
2756 // The destination type should be larger than the element type. If not, get
2757 // the default cost for the extend.
2758 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2759 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2760 CostKind);
2761
2762 switch (Opcode) {
2763 default:
2764 llvm_unreachable("Opcode should be either SExt or ZExt");
2765
2766 // For sign-extends, we only need a smov, which performs the extension
2767 // automatically.
2768 case Instruction::SExt:
2769 return Cost;
2770
2771 // For zero-extends, the extend is performed automatically by a umov unless
2772 // the destination type is i64 and the element type is i8 or i16.
2773 case Instruction::ZExt:
2774 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2775 return Cost;
2776 }
2777
2778 // If we are unable to perform the extend for free, get the default cost.
2779 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2780 CostKind);
2781}
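// For instance (assumed types): sext (extractelement <4 x i32> %v, 1) to i64
// is covered by a single smov, so only the extract is costed, whereas a zext
// of an i8 element to i64 falls through to the default cast cost.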
2782
2783 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2784 TTI::TargetCostKind CostKind,
2785 const Instruction *I) {
2786 if (CostKind != TTI::TCK_RecipThroughput)
2787 return Opcode == Instruction::PHI ? 0 : 1;
2788 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2789 // Branches are assumed to be predicted.
2790 return 0;
2791}
2792
2793InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2794 Type *Val,
2795 unsigned Index,
2796 bool HasRealUse) {
2797 assert(Val->isVectorTy() && "This must be a vector type");
2798
2799 if (Index != -1U) {
2800 // Legalize the type.
2801 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2802
2803 // This type is legalized to a scalar type.
2804 if (!LT.second.isVector())
2805 return 0;
2806
2807 // The type may be split. For fixed-width vectors we can normalize the
2808 // index to the new type.
2809 if (LT.second.isFixedLengthVector()) {
2810 unsigned Width = LT.second.getVectorNumElements();
2811 Index = Index % Width;
2812 }
2813
2814 // The element at index zero is already inside the vector.
2815 // - For a physical (HasRealUse==true) insert-element or extract-element
2816 // instruction that extracts integers, an explicit FPR -> GPR move is
2817 // needed. So it has non-zero cost.
2818 // - For the rest of cases (virtual instruction or element type is float),
2819 // consider the instruction free.
2820 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2821 return 0;
2822
2823 // This recognises an LD1 (single-element structure to one lane of one
2824 // register) instruction. I.e., if this is an `insertelement` instruction
2825 // and its second operand is a load, then we will generate an LD1, which
2826 // is an expensive instruction.
2827 if (I && isa<LoadInst>(I->getOperand(1)))
2828 return ST->getVectorInsertExtractBaseCost() + 1;
2829
2830 // i1 inserts and extracts will include an extra cset or cmp of the vector
2831 // value. Increase the cost by 1 to account for this.
2832 if (Val->getScalarSizeInBits() == 1)
2833 return ST->getVectorInsertExtractBaseCost() + 1;
2834
2835 // FIXME:
2836 // If the extract-element and insert-element instructions could be
2837 // simplified away (e.g., could be combined into users by looking at use-def
2838 // context), they have no cost. This is not done in the first place for
2839 // compile-time considerations.
2840 }
2841
2842 // All other insert/extracts cost this much.
2843 return ST->getVectorInsertExtractBaseCost();
2844}
2845
2846 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2847 TTI::TargetCostKind CostKind,
2848 unsigned Index, Value *Op0,
2849 Value *Op1) {
2850 bool HasRealUse =
2851 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2852 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2853}
2854
2855 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2856 Type *Val,
2857 TTI::TargetCostKind CostKind,
2858 unsigned Index) {
2859 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2860}
2861
2862 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
2863 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2864 TTI::TargetCostKind CostKind) {
2865 if (isa<ScalableVectorType>(Ty))
2866 return InstructionCost::getInvalid();
2867 if (Ty->getElementType()->isFloatingPointTy())
2868 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
2869 CostKind);
2870 return DemandedElts.popcount() * (Insert + Extract) *
2871 ST->getVectorInsertExtractBaseCost();
2872 }
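// Worked example (illustrative numbers): scalarizing all four lanes of a
// <4 x i32> for insertion only, with an assumed insert/extract base cost of 2,
// gives 4 * (1 + 0) * 2 = 8; floating-point element types instead defer to the
// generic BaseT estimate.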
2873
2874 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2875 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2876 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2877 ArrayRef<const Value *> Args,
2878 const Instruction *CxtI) {
2879
2880 // TODO: Handle more cost kinds.
2881 if (CostKind != TTI::TCK_RecipThroughput)
2882 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2883 Op2Info, Args, CxtI);
2884
2885 // Legalize the type.
2886 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2887 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2888
2889 switch (ISD) {
2890 default:
2891 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2892 Op2Info);
2893 case ISD::SDIV:
2894 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2895 // On AArch64, scalar signed division by a power-of-two constant is
2896 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2897 // The OperandValue properties may not be the same as those of the
2898 // previous operation; conservatively assume OP_None.
2899 InstructionCost Cost = getArithmeticInstrCost(
2900 Instruction::Add, Ty, CostKind,
2901 Op1Info.getNoProps(), Op2Info.getNoProps());
2902 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2903 Op1Info.getNoProps(), Op2Info.getNoProps());
2904 Cost += getArithmeticInstrCost(
2905 Instruction::Select, Ty, CostKind,
2906 Op1Info.getNoProps(), Op2Info.getNoProps());
2907 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2908 Op1Info.getNoProps(), Op2Info.getNoProps());
2909 return Cost;
2910 }
2911 [[fallthrough]];
2912 case ISD::UDIV: {
2913 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2914 auto VT = TLI->getValueType(DL, Ty);
2915 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2916 // Vector signed division by a constant is expanded to the
2917 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2918 // to MULHS + SUB + SRL + ADD + SRL.
2919 InstructionCost MulCost = getArithmeticInstrCost(
2920 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2921 InstructionCost AddCost = getArithmeticInstrCost(
2922 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2923 InstructionCost ShrCost = getArithmeticInstrCost(
2924 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2925 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2926 }
2927 }
2928
2929 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2930 Opcode, Ty, CostKind, Op1Info, Op2Info);
2931 if (Ty->isVectorTy()) {
2932 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2933 // SDIV/UDIV operations are lowered using SVE, so the cost is
2934 // lower.
2935 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2936 ->getPrimitiveSizeInBits()
2937 .getFixedValue() < 128) {
2938 EVT VT = TLI->getValueType(DL, Ty);
2939 static const CostTblEntry DivTbl[]{
2940 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2941 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2942 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2943 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2944 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2945 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2946
2947 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2948 if (nullptr != Entry)
2949 return Entry->Cost;
2950 }
2951 // For 8/16-bit elements, the cost is higher because the type
2952 // requires promotion and possibly splitting:
2953 if (LT.second.getScalarType() == MVT::i8)
2954 Cost *= 8;
2955 else if (LT.second.getScalarType() == MVT::i16)
2956 Cost *= 4;
2957 return Cost;
2958 } else {
2959 // If one of the operands is a uniform constant then the cost for each
2960 // element is the cost of insertion, extraction and division.
2961 // Insertion cost = 2, extraction cost = 2, division = cost of the
2962 // operation on the scalar type.
2963 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2964 (Op2Info.isConstant() && Op2Info.isUniform())) {
2965 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2966 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2967 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2968 return (4 + DivCost) * VTy->getNumElements();
2969 }
2970 }
2971 // On AArch64, without SVE, vector divisions are expanded
2972 // into scalar divisions of each pair of elements.
2973 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2974 CostKind, Op1Info, Op2Info);
2975 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2976 Op1Info, Op2Info);
2977 }
2978
2979 // TODO: if one of the arguments is scalar, then it's not necessary to
2980 // double the cost of handling the vector elements.
2981 Cost += Cost;
2982 }
2983 return Cost;
2984 }
2985 case ISD::MUL:
2986 // When SVE is available, we can lower the v2i64 operation using
2987 // the SVE mul instruction, which has a lower cost.
2988 if (LT.second == MVT::v2i64 && ST->hasSVE())
2989 return LT.first;
2990
2991 // When SVE is not available, there is no MUL.2d instruction,
2992 // which means mul <2 x i64> is expensive as elements are extracted
2993 // from the vectors and the muls scalarized.
2994 // As getScalarizationOverhead is a bit too pessimistic, we
2995 // estimate the cost for a i64 vector directly here, which is:
2996 // - four 2-cost i64 extracts,
2997 // - two 2-cost i64 inserts, and
2998 // - two 1-cost muls.
2999 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3000 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3001 // need to scalarize, so the cost can be cheaper (smull or umull).
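// For instance (illustrative IR):
//   %a = sext <2 x i32> %x to <2 x i64>
//   %b = sext <2 x i32> %y to <2 x i64>
//   %m = mul <2 x i64> %a, %b
// is recognised by isWideningInstruction and can be selected as smull, so it
// keeps the cheap LT.first cost below instead of the scalarized 14.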
3003 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3004 return LT.first;
3005 return LT.first * 14;
3006 case ISD::ADD:
3007 case ISD::XOR:
3008 case ISD::OR:
3009 case ISD::AND:
3010 case ISD::SRL:
3011 case ISD::SRA:
3012 case ISD::SHL:
3013 // These nodes are marked as 'custom' for combining purposes only.
3014 // We know that they are legal. See LowerAdd in ISelLowering.
3015 return LT.first;
3016
3017 case ISD::FNEG:
3018 case ISD::FADD:
3019 case ISD::FSUB:
3020 // Increase the cost for half and bfloat types if not architecturally
3021 // supported.
3022 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3023 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3024 return 2 * LT.first;
3025 if (!Ty->getScalarType()->isFP128Ty())
3026 return LT.first;
3027 [[fallthrough]];
3028 case ISD::FMUL:
3029 case ISD::FDIV:
3030 // These nodes are marked as 'custom' just to lower them to SVE.
3031 // We know said lowering will incur no additional cost.
3032 if (!Ty->getScalarType()->isFP128Ty())
3033 return 2 * LT.first;
3034
3035 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3036 Op2Info);
3037 case ISD::FREM:
3038 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3039 // those functions are not declared in the module.
3040 if (!Ty->isVectorTy())
3041 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3042 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3043 Op2Info);
3044 }
3045}
3046
3047 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3048 ScalarEvolution *SE,
3049 const SCEV *Ptr) {
3050 // Address computations in vectorized code with non-consecutive addresses will
3051 // likely result in more instructions compared to scalar code where the
3052 // computation can more often be merged into the index mode. The resulting
3053 // extra micro-ops can significantly decrease throughput.
3054 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3055 int MaxMergeDistance = 64;
3056
3057 if (Ty->isVectorTy() && SE &&
3058 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3059 return NumVectorInstToHideOverhead;
3060
3061 // In many cases the address computation is not merged into the instruction
3062 // addressing mode.
3063 return 1;
3064}
3065
3066 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3067 Type *CondTy,
3068 CmpInst::Predicate VecPred,
3069 TTI::TargetCostKind CostKind,
3070 const Instruction *I) {
3071 // TODO: Handle other cost kinds.
3072 if (CostKind != TTI::TCK_RecipThroughput)
3073 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3074 I);
3075
3076 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3077 // We don't lower some vector selects that are wider than the register
3078 // width particularly well.
3079 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3080 // We would need this many instructions to hide the scalarization happening.
3081 const int AmortizationCost = 20;
3082
3083 // If VecPred is not set, check if we can get a predicate from the context
3084 // instruction, if its type matches the requested ValTy.
3085 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3086 CmpInst::Predicate CurrentPred;
3087 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3088 m_Value())))
3089 VecPred = CurrentPred;
3090 }
3091 // Check if we have a compare/select chain that can be lowered using
3092 // a (F)CMxx & BFI pair.
3093 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3094 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3095 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3096 VecPred == CmpInst::FCMP_UNE) {
3097 static const auto ValidMinMaxTys = {
3098 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3099 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3100 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3101
3102 auto LT = getTypeLegalizationCost(ValTy);
3103 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3104 (ST->hasFullFP16() &&
3105 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3106 return LT.first;
3107 }
3108
3109 static const TypeConversionCostTblEntry
3110 VectorSelectTbl[] = {
3111 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3112 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3113 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3114 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3115 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3116 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3117 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3118 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3119 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3120 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3121 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3122 };
3123
3124 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3125 EVT SelValTy = TLI->getValueType(DL, ValTy);
3126 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3127 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3128 SelCondTy.getSimpleVT(),
3129 SelValTy.getSimpleVT()))
3130 return Entry->Cost;
3131 }
3132 }
3133
3134 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3135 auto LT = getTypeLegalizationCost(ValTy);
3136 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3137 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3138 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3139 }
3140
3141 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3142 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3143 // be profitable.
3144 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3145 ICmpInst::isEquality(VecPred) &&
3146 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3147 match(I->getOperand(1), m_Zero()) &&
3148 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3149 return 0;
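// For example, for:
//   %t = and i64 %a, %b
//   %c = icmp eq i64 %t, 0
// the compare can be folded into a flag-setting ands/tst, so the icmp itself
// is modelled as free here.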
3150
3151 // The base case handles scalable vectors fine for now, since it treats the
3152 // cost as 1 * legalization cost.
3153 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3154}
3155
3156 TTI::MemCmpExpansionOptions
3157 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3158 TTI::MemCmpExpansionOptions Options;
3159 if (ST->requiresStrictAlign()) {
3160 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3161 // a bunch of instructions when strict align is enabled.
3162 return Options;
3163 }
3164 Options.AllowOverlappingLoads = true;
3165 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3166 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3167 // TODO: Though vector loads usually perform well on AArch64, on some targets
3168 // they may wake up the FP unit, which raises the power consumption. Perhaps
3169 // they could be used with no holds barred (-O3).
3170 Options.LoadSizes = {8, 4, 2, 1};
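// Presumably these tail sizes are allowed because a 3, 5 or 6 byte remainder
// can be handled with just two of the loads above (2+1, 4+1 or 4+2 bytes)
// rather than falling back to a byte-by-byte tail.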
3171 Options.AllowedTailExpansions = {3, 5, 6};
3172 return Options;
3173}
3174
3175 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3176 return ST->hasSVE();
3177}
3178
3179 InstructionCost
3180 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3181 Align Alignment, unsigned AddressSpace,
3182 TTI::TargetCostKind CostKind) {
3183 if (useNeonVector(Src))
3184 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3185 CostKind);
3186 auto LT = getTypeLegalizationCost(Src);
3187 if (!LT.first.isValid())
3189
3190 // The code-generator is currently not able to handle scalable vectors
3191 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3192 // it. This change will be removed when code-generation for these types is
3193 // sufficiently reliable.
3194 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
3196
3197 return LT.first;
3198}
3199
3200static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3201 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3202}
3203
3204 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3205 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3206 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3207 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3208 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3209 Alignment, CostKind, I);
3210 auto *VT = cast<VectorType>(DataTy);
3211 auto LT = getTypeLegalizationCost(DataTy);
3212 if (!LT.first.isValid())
3214
3215 if (!LT.second.isVector() ||
3216 !isElementTypeLegalForScalableVector(VT->getElementType()))
3218
3219 // The code-generator is currently not able to handle scalable vectors
3220 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3221 // it. This change will be removed when code-generation for these types is
3222 // sufficiently reliable.
3223 if (cast<VectorType>(DataTy)->getElementCount() ==
3226
3227 ElementCount LegalVF = LT.second.getVectorElementCount();
3228 InstructionCost MemOpCost =
3229 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3230 {TTI::OK_AnyValue, TTI::OP_None}, I);
3231 // Add on an overhead cost for using gathers/scatters.
3232 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3233 // point we may want a per-CPU overhead.
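// As a rough sketch, assuming the default sve-gather-overhead of 10: for a
// <vscale x 4 x i32> gather the result below is
//   LT.first * (element load cost * 10) * getMaxNumElements(vscale x 4)
// which keeps gathers/scatters much more expensive than contiguous accesses.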
3234 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3235 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3236}
3237
3238 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3239 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3240}
3241
3242 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3243 MaybeAlign Alignment,
3244 unsigned AddressSpace,
3245 TTI::TargetCostKind CostKind,
3246 TTI::OperandValueInfo OpInfo,
3247 const Instruction *I) {
3248 EVT VT = TLI->getValueType(DL, Ty, true);
3249 // Type legalization can't handle structs
3250 if (VT == MVT::Other)
3251 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3252 CostKind);
3253
3254 auto LT = getTypeLegalizationCost(Ty);
3255 if (!LT.first.isValid())
3257
3258 // The code-generator is currently not able to handle scalable vectors
3259 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3260 // it. This change will be removed when code-generation for these types is
3261 // sufficiently reliable.
3262 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3263 if (VTy->getElementCount() == ElementCount::getScalable(1))
3265
3266 // TODO: consider latency as well for TCK_SizeAndLatency.
3268 return LT.first;
3269
3271 return 1;
3272
3273 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3274 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3275 // Unaligned stores are extremely inefficient. We don't split all
3276 // unaligned 128-bit stores because of the negative impact that has been
3277 // shown in practice on inlined block copy code.
3278 // We make such stores expensive so that we will only vectorize if there
3279 // are 6 other instructions getting vectorized.
3280 const int AmortizationCost = 6;
3281
3282 return LT.first * 2 * AmortizationCost;
3283 }
3284
3285 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3286 if (Ty->isPtrOrPtrVectorTy())
3287 return LT.first;
3288
3289 if (useNeonVector(Ty)) {
3290 // Check truncating stores and extending loads.
3291 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3292 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3293 if (VT == MVT::v4i8)
3294 return 2;
3295 // Otherwise we need to scalarize.
3296 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3297 }
3298 EVT EltVT = VT.getVectorElementType();
3299 unsigned EltSize = EltVT.getScalarSizeInBits();
3300 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3301 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3302 *Alignment != Align(1))
3303 return LT.first;
3304 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3305 // widening to v4i8, which produces suboptimal results.
3306 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3307 return LT.first;
3308
3309 // Check non-power-of-2 loads/stores for legal vector element types with
3310 // NEON. Non-power-of-2 memory ops will get broken down into a set of
3311 // smaller power-of-2 operations, including ld1/st1.
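// For example, a hypothetical v7i8 access is decomposed below as v4i8 + v3i8,
// and the v3i8 part again as v2i8 + v1i8, giving a total cost of 3.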
3312 LLVMContext &C = Ty->getContext();
3314 SmallVector<EVT> TypeWorklist;
3315 TypeWorklist.push_back(VT);
3316 while (!TypeWorklist.empty()) {
3317 EVT CurrVT = TypeWorklist.pop_back_val();
3318 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3319 if (isPowerOf2_32(CurrNumElements)) {
3320 Cost += 1;
3321 continue;
3322 }
3323
3324 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3325 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3326 TypeWorklist.push_back(
3327 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3328 }
3329 return Cost;
3330 }
3331
3332 return LT.first;
3333}
3334
3335 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3336 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3337 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3338 bool UseMaskForCond, bool UseMaskForGaps) {
3339 assert(Factor >= 2 && "Invalid interleave factor");
3340 auto *VecVTy = cast<VectorType>(VecTy);
3341
3342 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3344
3345 // Vectorization for masked interleaved accesses is only enabled for scalable
3346 // VF.
3347 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3349
3350 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3351 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3352 auto *SubVecTy =
3353 VectorType::get(VecVTy->getElementType(),
3354 VecVTy->getElementCount().divideCoefficientBy(Factor));
3355
3356 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3357 // Accesses having vector types that are a multiple of 128 bits can be
3358 // matched to more than one ldN/stN instruction.
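// For instance (a sketch): an ld3 of <12 x i32> uses SubVecTy = <4 x i32>,
// which is a legal 128-bit type, so the cost returned below would be
// Factor (3) times a single ld3.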
3359 bool UseScalable;
3360 if (MinElts % Factor == 0 &&
3361 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3362 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3363 }
3364
3365 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3366 Alignment, AddressSpace, CostKind,
3367 UseMaskForCond, UseMaskForGaps);
3368}
3369
3370 InstructionCost
3371 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3372 InstructionCost Cost = 0;
3373 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3374 for (auto *I : Tys) {
3375 if (!I->isVectorTy())
3376 continue;
3377 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3378 128)
3379 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3380 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3381 }
3382 return Cost;
3383}
3384
3386 return ST->getMaxInterleaveFactor();
3387}
3388
3389// For Falkor, we want to avoid having too many strided loads in a loop since
3390// that can exhaust the HW prefetcher resources. We adjust the unroller
3391// MaxCount preference below to attempt to ensure unrolling doesn't create too
3392// many strided loads.
3393static void
3396 enum { MaxStridedLoads = 7 };
3397 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3398 int StridedLoads = 0;
3399 // FIXME? We could make this more precise by looking at the CFG and
3400 // e.g. not counting loads in each side of an if-then-else diamond.
3401 for (const auto BB : L->blocks()) {
3402 for (auto &I : *BB) {
3403 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3404 if (!LMemI)
3405 continue;
3406
3407 Value *PtrValue = LMemI->getPointerOperand();
3408 if (L->isLoopInvariant(PtrValue))
3409 continue;
3410
3411 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3412 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3413 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3414 continue;
3415
3416 // FIXME? We could take pairing of unrolled load copies into account
3417 // by looking at the AddRec, but we would probably have to limit this
3418 // to loops with no stores or other memory optimization barriers.
3419 ++StridedLoads;
3420 // We've seen enough strided loads that seeing more won't make a
3421 // difference.
3422 if (StridedLoads > MaxStridedLoads / 2)
3423 return StridedLoads;
3424 }
3425 }
3426 return StridedLoads;
3427 };
3428
3429 int StridedLoads = countStridedLoads(L, SE);
3430 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3431 << " strided loads\n");
3432 // Pick the largest power of 2 unroll count that won't result in too many
3433 // strided loads.
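// E.g. with MaxStridedLoads = 7 and two strided loads detected, MaxCount
// becomes 1 << Log2_32(7 / 2) = 2.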
3434 if (StridedLoads) {
3435 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3436 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3437 << UP.MaxCount << '\n');
3438 }
3439}
3440
3444 // Enable partial unrolling and runtime unrolling.
3445 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3446
3447 UP.UpperBound = true;
3448
3449 // An inner loop is more likely to be hot, and the runtime check can be
3450 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3451 // threshold to unroll more loops.
3452 if (L->getLoopDepth() > 1)
3453 UP.PartialThreshold *= 2;
3454
3455 // Disable partial & runtime unrolling on -Os.
3457
3458 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3461
3462 // Scan the loop: don't unroll loops with calls as this could prevent
3463 // inlining. Don't unroll vector loops either, as they don't benefit much from
3464 // unrolling.
3465 for (auto *BB : L->getBlocks()) {
3466 for (auto &I : *BB) {
3467 // Don't unroll vectorised loops.
3468 if (I.getType()->isVectorTy())
3469 return;
3470
3471 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3472 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3473 if (!isLoweredToCall(F))
3474 continue;
3475 }
3476 return;
3477 }
3478 }
3479 }
3480
3481 // Enable runtime unrolling for in-order models.
3482 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3483 // checking for that case, we can ensure that the default behaviour is
3484 // unchanged.
3486 !ST->getSchedModel().isOutOfOrder()) {
3487 UP.Runtime = true;
3488 UP.Partial = true;
3489 UP.UnrollRemainder = true;
3491
3492 UP.UnrollAndJam = true;
3494 }
3495}
3496
3500}
3501
3503 Type *ExpectedType) {
3504 switch (Inst->getIntrinsicID()) {
3505 default:
3506 return nullptr;
3507 case Intrinsic::aarch64_neon_st2:
3508 case Intrinsic::aarch64_neon_st3:
3509 case Intrinsic::aarch64_neon_st4: {
3510 // Check that the expected type is a struct type.
3511 StructType *ST = dyn_cast<StructType>(ExpectedType);
3512 if (!ST)
3513 return nullptr;
3514 unsigned NumElts = Inst->arg_size() - 1;
3515 if (ST->getNumElements() != NumElts)
3516 return nullptr;
3517 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3518 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3519 return nullptr;
3520 }
3521 Value *Res = PoisonValue::get(ExpectedType);
3522 IRBuilder<> Builder(Inst);
3523 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3524 Value *L = Inst->getArgOperand(i);
3525 Res = Builder.CreateInsertValue(Res, L, i);
3526 }
3527 return Res;
3528 }
3529 case Intrinsic::aarch64_neon_ld2:
3530 case Intrinsic::aarch64_neon_ld3:
3531 case Intrinsic::aarch64_neon_ld4:
3532 if (Inst->getType() == ExpectedType)
3533 return Inst;
3534 return nullptr;
3535 }
3536}
3537
3539 MemIntrinsicInfo &Info) {
3540 switch (Inst->getIntrinsicID()) {
3541 default:
3542 break;
3543 case Intrinsic::aarch64_neon_ld2:
3544 case Intrinsic::aarch64_neon_ld3:
3545 case Intrinsic::aarch64_neon_ld4:
3546 Info.ReadMem = true;
3547 Info.WriteMem = false;
3548 Info.PtrVal = Inst->getArgOperand(0);
3549 break;
3550 case Intrinsic::aarch64_neon_st2:
3551 case Intrinsic::aarch64_neon_st3:
3552 case Intrinsic::aarch64_neon_st4:
3553 Info.ReadMem = false;
3554 Info.WriteMem = true;
3555 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3556 break;
3557 }
3558
3559 switch (Inst->getIntrinsicID()) {
3560 default:
3561 return false;
3562 case Intrinsic::aarch64_neon_ld2:
3563 case Intrinsic::aarch64_neon_st2:
3564 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3565 break;
3566 case Intrinsic::aarch64_neon_ld3:
3567 case Intrinsic::aarch64_neon_st3:
3568 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3569 break;
3570 case Intrinsic::aarch64_neon_ld4:
3571 case Intrinsic::aarch64_neon_st4:
3572 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3573 break;
3574 }
3575 return true;
3576}
3577
3578 /// See if \p I should be considered for address type promotion. We check if \p
3579 /// I is a sext with the right type and used in memory accesses. If it is used
3580 /// in a "complex" getelementptr, we allow it to be promoted without finding
3581 /// other sext instructions that sign extended the same initial value. A
3582 /// getelementptr is considered "complex" if it has more than 2 operands.
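/// For example (illustrative IR):
///   %idx = sext i32 %i to i64
///   %gep = getelementptr inbounds [64 x i32], ptr %p, i64 0, i64 %idx
/// Here the GEP has more than 2 operands, so the sext may be promoted without
/// requiring a common header.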
3583 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3584 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3585 bool Considerable = false;
3586 AllowPromotionWithoutCommonHeader = false;
3587 if (!isa<SExtInst>(&I))
3588 return false;
3589 Type *ConsideredSExtType =
3590 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3591 if (I.getType() != ConsideredSExtType)
3592 return false;
3593 // See if the sext is the one with the right type and used in at least one
3594 // GetElementPtrInst.
3595 for (const User *U : I.users()) {
3596 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3597 Considerable = true;
3598 // A getelementptr is considered "complex" if it has more than 2
3599 // operands. We will promote a SExt used in such a complex GEP, as we
3600 // expect some computation to be merged if it is done on 64 bits.
3601 if (GEPInst->getNumOperands() > 2) {
3602 AllowPromotionWithoutCommonHeader = true;
3603 break;
3604 }
3605 }
3606 }
3607 return Considerable;
3608}
3609
3611 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3612 if (!VF.isScalable())
3613 return true;
3614
3615 Type *Ty = RdxDesc.getRecurrenceType();
3617 return false;
3618
3619 switch (RdxDesc.getRecurrenceKind()) {
3620 case RecurKind::Add:
3621 case RecurKind::FAdd:
3622 case RecurKind::And:
3623 case RecurKind::Or:
3624 case RecurKind::Xor:
3625 case RecurKind::SMin:
3626 case RecurKind::SMax:
3627 case RecurKind::UMin:
3628 case RecurKind::UMax:
3629 case RecurKind::FMin:
3630 case RecurKind::FMax:
3631 case RecurKind::FMulAdd:
3632 case RecurKind::IAnyOf:
3633 case RecurKind::FAnyOf:
3634 return true;
3635 default:
3636 return false;
3637 }
3638}
3639
3642 FastMathFlags FMF,
3644 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3645
3646 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3647 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3648
3649 InstructionCost LegalizationCost = 0;
3650 if (LT.first > 1) {
3651 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3652 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3653 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3654 }
3655
3656 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3657}
3658
3660 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3661 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3662 InstructionCost LegalizationCost = 0;
3663 if (LT.first > 1) {
3664 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3665 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3666 LegalizationCost *= LT.first - 1;
3667 }
3668
3669 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3670 assert(ISD && "Invalid opcode");
3671 // Add the final reduction cost for the legal horizontal reduction
3672 switch (ISD) {
3673 case ISD::ADD:
3674 case ISD::AND:
3675 case ISD::OR:
3676 case ISD::XOR:
3677 case ISD::FADD:
3678 return LegalizationCost + 2;
3679 default:
3681 }
3682}
3683
3684 InstructionCost
3685 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3686 std::optional<FastMathFlags> FMF,
3687 TTI::TargetCostKind CostKind) {
3688 if (TTI::requiresOrderedReduction(FMF)) {
3689 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3690 InstructionCost BaseCost =
3691 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3692 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3693 // end up vectorizing for more computationally intensive loops.
3694 return BaseCost + FixedVTy->getNumElements();
3695 }
3696
3697 if (Opcode != Instruction::FAdd)
3699
3700 auto *VTy = cast<ScalableVectorType>(ValTy);
3701 InstructionCost Cost =
3702 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3703 Cost *= getMaxNumElements(VTy->getElementCount());
3704 return Cost;
3705 }
3706
3707 if (isa<ScalableVectorType>(ValTy))
3708 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3709
3710 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3711 MVT MTy = LT.second;
3712 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3713 assert(ISD && "Invalid opcode");
3714
3715 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3716 // instructions as twice a normal vector add, plus 1 for each legalization
3717 // step (LT.first). This is the only arithmetic vector reduction operation for
3718 // which we have an instruction.
3719 // OR, XOR and AND costs should match the codegen from:
3720 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3721 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3722 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
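// As a worked example (a sketch): a v8i16 add reduction is a single addv with
// LT.first = 1, giving (1 - 1) + 2 = 2, while a v16i16 one is first split in
// two, giving (2 - 1) + 2 = 3.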
3723 static const CostTblEntry CostTblNoPairwise[]{
3724 {ISD::ADD, MVT::v8i8, 2},
3725 {ISD::ADD, MVT::v16i8, 2},
3726 {ISD::ADD, MVT::v4i16, 2},
3727 {ISD::ADD, MVT::v8i16, 2},
3728 {ISD::ADD, MVT::v4i32, 2},
3729 {ISD::ADD, MVT::v2i64, 2},
3730 {ISD::OR, MVT::v8i8, 15},
3731 {ISD::OR, MVT::v16i8, 17},
3732 {ISD::OR, MVT::v4i16, 7},
3733 {ISD::OR, MVT::v8i16, 9},
3734 {ISD::OR, MVT::v2i32, 3},
3735 {ISD::OR, MVT::v4i32, 5},
3736 {ISD::OR, MVT::v2i64, 3},
3737 {ISD::XOR, MVT::v8i8, 15},
3738 {ISD::XOR, MVT::v16i8, 17},
3739 {ISD::XOR, MVT::v4i16, 7},
3740 {ISD::XOR, MVT::v8i16, 9},
3741 {ISD::XOR, MVT::v2i32, 3},
3742 {ISD::XOR, MVT::v4i32, 5},
3743 {ISD::XOR, MVT::v2i64, 3},
3744 {ISD::AND, MVT::v8i8, 15},
3745 {ISD::AND, MVT::v16i8, 17},
3746 {ISD::AND, MVT::v4i16, 7},
3747 {ISD::AND, MVT::v8i16, 9},
3748 {ISD::AND, MVT::v2i32, 3},
3749 {ISD::AND, MVT::v4i32, 5},
3750 {ISD::AND, MVT::v2i64, 3},
3751 };
3752 switch (ISD) {
3753 default:
3754 break;
3755 case ISD::ADD:
3756 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3757 return (LT.first - 1) + Entry->Cost;
3758 break;
3759 case ISD::XOR:
3760 case ISD::AND:
3761 case ISD::OR:
3762 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3763 if (!Entry)
3764 break;
3765 auto *ValVTy = cast<FixedVectorType>(ValTy);
3766 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3767 isPowerOf2_32(ValVTy->getNumElements())) {
3768 InstructionCost ExtraCost = 0;
3769 if (LT.first != 1) {
3770 // Type needs to be split, so there is an extra cost of LT.first - 1
3771 // arithmetic ops.
3772 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3773 MTy.getVectorNumElements());
3774 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3775 ExtraCost *= LT.first - 1;
3776 }
3777 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3778 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3779 return Cost + ExtraCost;
3780 }
3781 break;
3782 }
3783 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3784}
3785
3787 static const CostTblEntry ShuffleTbl[] = {
3788 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3789 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3790 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3791 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3792 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3793 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3794 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3795 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3796 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3797 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3798 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3799 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3800 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3801 };
3802
3803 // The code-generator is currently not able to handle scalable vectors
3804 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3805 // it. This change will be removed when code-generation for these types is
3806 // sufficiently reliable.
3809
3810 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3811 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3813 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3814 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3815 : LT.second;
3816 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3817 InstructionCost LegalizationCost = 0;
3818 if (Index < 0) {
3819 LegalizationCost =
3820 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3822 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3824 }
3825
3826 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
3827 // The cost is computed on the promoted type.
3828 if (LT.second.getScalarType() == MVT::i1) {
3829 LegalizationCost +=
3830 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3832 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3834 }
3835 const auto *Entry =
3836 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3837 assert(Entry && "Illegal Type for Splice");
3838 LegalizationCost += Entry->Cost;
3839 return LegalizationCost * LT.first;
3840}
3841
3842 InstructionCost AArch64TTIImpl::getShuffleCost(
3843 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
3844 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
3845 ArrayRef<const Value *> Args, const Instruction *CxtI) {
3846 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3847
3848 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3849 // into smaller vectors and sum the cost of each shuffle.
3850 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3851 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3852 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
3853
3854 // Check for LD3/LD4 instructions, which are represented in llvm IR as
3855 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3856 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
3857 // cost than just the load.
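// A deinterleaving mask for such an ld3 of <12 x i32> would look like
// <0, 3, 6, 9> (with <1, 4, 7, 10> and <2, 5, 8, 11> for the other lanes).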
3858 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
3861 return std::max<InstructionCost>(1, LT.first / 4);
3862
3863 // Check for ST3/ST4 instructions, which are represented in llvm IR as
3864 // store(interleaving-shuffle). The shuffle cost could potentially be free,
3865 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
3866 // cost than just the store.
3867 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
3869 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
3871 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
3872 return LT.first;
3873
3874 unsigned TpNumElts = Mask.size();
3875 unsigned LTNumElts = LT.second.getVectorNumElements();
3876 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3877 VectorType *NTp =
3878 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3880 for (unsigned N = 0; N < NumVecs; N++) {
3881 SmallVector<int> NMask;
3882 // Split the existing mask into chunks of size LTNumElts. Track the source
3883 // sub-vectors to ensure the result has at most 2 inputs.
3884 unsigned Source1, Source2;
3885 unsigned NumSources = 0;
3886 for (unsigned E = 0; E < LTNumElts; E++) {
3887 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3889 if (MaskElt < 0) {
3891 continue;
3892 }
3893
3894 // Calculate which source from the input this comes from and whether it
3895 // is new to us.
3896 unsigned Source = MaskElt / LTNumElts;
3897 if (NumSources == 0) {
3898 Source1 = Source;
3899 NumSources = 1;
3900 } else if (NumSources == 1 && Source != Source1) {
3901 Source2 = Source;
3902 NumSources = 2;
3903 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3904 NumSources++;
3905 }
3906
3907 // Add to the new mask. For the NumSources>2 case these are not correct,
3908 // but are only used for the modular lane number.
3909 if (Source == Source1)
3910 NMask.push_back(MaskElt % LTNumElts);
3911 else if (Source == Source2)
3912 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3913 else
3914 NMask.push_back(MaskElt % LTNumElts);
3915 }
3916 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3917 // getShuffleCost. If not then cost it using the worst case.
3918 if (NumSources <= 2)
3919 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3921 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
3922 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3923 return ME.value() % LTNumElts == ME.index();
3924 }))
3925 Cost += LTNumElts - 1;
3926 else
3927 Cost += LTNumElts;
3928 }
3929 return Cost;
3930 }
3931
3932 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
3933 // Treat extractsubvector as single op permutation.
3934 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
3935 if (IsExtractSubvector && LT.second.isFixedLengthVector())
3937
3938 // Check for broadcast loads, which are supported by the LD1R instruction.
3939 // In terms of code-size, the shuffle vector is free when a load + dup get
3940 // folded into a LD1R. That's what we check and return here. For performance
3941 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3942 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3943 // that we model the load + dup sequence slightly higher because LD1R is a
3944 // high latency instruction.
3945 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3946 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3947 if (IsLoad && LT.second.isVector() &&
3949 LT.second.getVectorElementCount()))
3950 return 0;
3951 }
3952
3953 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3954 // from the perfect shuffle tables.
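// E.g. a v4i32 shuffle with mask <1, 0, 3, 2> is costed from that table and
// should map to a single REV64-style instruction.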
3955 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3956 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3957 all_of(Mask, [](int E) { return E < 8; }))
3958 return getPerfectShuffleCost(Mask);
3959
3960 // Check for identity masks, which we can treat as free.
3961 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
3962 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3963 all_of(enumerate(Mask), [](const auto &M) {
3964 return M.value() < 0 || M.value() == (int)M.index();
3965 }))
3966 return 0;
3967
3968 // Check for other shuffles that do not map to an SK_ kind but for which we
3969 // have native instructions, for example ZIP and UZP.
3970 unsigned Unused;
3971 if (LT.second.isFixedLengthVector() &&
3972 LT.second.getVectorNumElements() == Mask.size() &&
3973 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3974 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
3975 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
3976 // Check for non-zero lane splats
3977 all_of(drop_begin(Mask),
3978 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
3979 return 1;
3980
3981 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3982 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3983 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3984 static const CostTblEntry ShuffleTbl[] = {
3985 // Broadcast shuffle kinds can be performed with 'dup'.
3986 {TTI::SK_Broadcast, MVT::v8i8, 1},
3987 {TTI::SK_Broadcast, MVT::v16i8, 1},
3988 {TTI::SK_Broadcast, MVT::v4i16, 1},
3989 {TTI::SK_Broadcast, MVT::v8i16, 1},
3990 {TTI::SK_Broadcast, MVT::v2i32, 1},
3991 {TTI::SK_Broadcast, MVT::v4i32, 1},
3992 {TTI::SK_Broadcast, MVT::v2i64, 1},
3993 {TTI::SK_Broadcast, MVT::v4f16, 1},
3994 {TTI::SK_Broadcast, MVT::v8f16, 1},
3995 {TTI::SK_Broadcast, MVT::v2f32, 1},
3996 {TTI::SK_Broadcast, MVT::v4f32, 1},
3997 {TTI::SK_Broadcast, MVT::v2f64, 1},
3998 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3999 // 'zip1/zip2' instructions.
4000 {TTI::SK_Transpose, MVT::v8i8, 1},
4001 {TTI::SK_Transpose, MVT::v16i8, 1},
4002 {TTI::SK_Transpose, MVT::v4i16, 1},
4003 {TTI::SK_Transpose, MVT::v8i16, 1},
4004 {TTI::SK_Transpose, MVT::v2i32, 1},
4005 {TTI::SK_Transpose, MVT::v4i32, 1},
4006 {TTI::SK_Transpose, MVT::v2i64, 1},
4007 {TTI::SK_Transpose, MVT::v4f16, 1},
4008 {TTI::SK_Transpose, MVT::v8f16, 1},
4009 {TTI::SK_Transpose, MVT::v2f32, 1},
4010 {TTI::SK_Transpose, MVT::v4f32, 1},
4011 {TTI::SK_Transpose, MVT::v2f64, 1},
4012 // Select shuffle kinds.
4013 // TODO: handle vXi8/vXi16.
4014 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4015 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4016 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4017 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4018 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4019 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4020 // PermuteSingleSrc shuffle kinds.
4021 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4022 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4023 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4024 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4025 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4026 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4027 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4028 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4029 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4030 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4031 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4032 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4033 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4034 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4035 // Reverse can be lowered with `rev`.
4036 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4037 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4038 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4039 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4040 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4041 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4042 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4043 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4044 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4045 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4046 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4047 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4048 // Splice can all be lowered as `ext`.
4049 {TTI::SK_Splice, MVT::v2i32, 1},
4050 {TTI::SK_Splice, MVT::v4i32, 1},
4051 {TTI::SK_Splice, MVT::v2i64, 1},
4052 {TTI::SK_Splice, MVT::v2f32, 1},
4053 {TTI::SK_Splice, MVT::v4f32, 1},
4054 {TTI::SK_Splice, MVT::v2f64, 1},
4055 {TTI::SK_Splice, MVT::v8f16, 1},
4056 {TTI::SK_Splice, MVT::v8bf16, 1},
4057 {TTI::SK_Splice, MVT::v8i16, 1},
4058 {TTI::SK_Splice, MVT::v16i8, 1},
4059 {TTI::SK_Splice, MVT::v4bf16, 1},
4060 {TTI::SK_Splice, MVT::v4f16, 1},
4061 {TTI::SK_Splice, MVT::v4i16, 1},
4062 {TTI::SK_Splice, MVT::v8i8, 1},
4063 // Broadcast shuffle kinds for scalable vectors
4064 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4065 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4066 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4067 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4068 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4069 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4070 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4071 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4072 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4073 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4074 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4075 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4076 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4077 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4078 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4079 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4080 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4081 // Handle the cases for vector.reverse with scalable vectors
4082 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4083 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4084 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4085 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4086 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4087 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4088 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4089 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4090 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4091 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4092 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4093 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4094 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4095 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4096 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4097 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4098 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4099 };
4100 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4101 return LT.first * Entry->Cost;
4102 }
4103
4104 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4105 return getSpliceCost(Tp, Index);
4106
4107 // Inserting a subvector can often be done with either a D, S or H register
4108 // move, so long as the inserted vector is "aligned".
4109 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4110 LT.second.getSizeInBits() <= 128 && SubTp) {
4111 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4112 if (SubLT.second.isVector()) {
4113 int NumElts = LT.second.getVectorNumElements();
4114 int NumSubElts = SubLT.second.getVectorNumElements();
4115 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4116 return SubLT.first;
4117 }
4118 }
4119
4120 // Restore optimal kind.
4121 if (IsExtractSubvector)
4123 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4124 CxtI);
4125}
4126
4129 const auto &Strides = DenseMap<Value *, const SCEV *>();
4130 for (BasicBlock *BB : TheLoop->blocks()) {
4131 // Scan the instructions in the block and look for addresses that are
4132 // consecutive and decreasing.
4133 for (Instruction &I : *BB) {
4134 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4136 Type *AccessTy = getLoadStoreType(&I);
4137 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4138 /*ShouldCheckWrap=*/false)
4139 .value_or(0) < 0)
4140 return true;
4141 }
4142 }
4143 }
4144 return false;
4145}
4146
4148 if (!ST->hasSVE())
4149 return false;
4150
4151 // We don't currently support vectorisation with interleaving for SVE - with
4152 // such loops we're better off not using tail-folding. This gives us a chance
4153 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4154 if (TFI->IAI->hasGroups())
4155 return false;
4156
4158 if (TFI->LVL->getReductionVars().size())
4159 Required |= TailFoldingOpts::Reductions;
4160 if (TFI->LVL->getFixedOrderRecurrences().size())
4161 Required |= TailFoldingOpts::Recurrences;
4162
4163 // We call this to discover whether any load/store pointers in the loop have
4164 // negative strides. This will require extra work to reverse the loop
4165 // predicate, which may be expensive.
4168 Required |= TailFoldingOpts::Reverse;
4169 if (Required == TailFoldingOpts::Disabled)
4170 Required |= TailFoldingOpts::Simple;
4171
4173 Required))
4174 return false;
4175
4176 // Don't tail-fold for tight loops where we would be better off interleaving
4177 // with an unpredicated loop.
4178 unsigned NumInsns = 0;
4179 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4180 NumInsns += BB->sizeWithoutDebug();
4181 }
4182
4183 // We expect 4 of these to be an IV PHI, an IV add, an IV compare and a branch.
4184 return NumInsns >= SVETailFoldInsnThreshold;
4185}
4186
4189 StackOffset BaseOffset, bool HasBaseReg,
4190 int64_t Scale, unsigned AddrSpace) const {
4191 // Scaling factors are not free at all.
4192 // Operands | Rt Latency
4193 // -------------------------------------------
4194 // Rt, [Xn, Xm] | 4
4195 // -------------------------------------------
4196 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4197 // Rt, [Xn, Wm, <extend> #imm] |
4199 AM.BaseGV = BaseGV;
4200 AM.BaseOffs = BaseOffset.getFixed();
4201 AM.HasBaseReg = HasBaseReg;
4202 AM.Scale = Scale;
4203 AM.ScalableOffset = BaseOffset.getScalable();
4204 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4205 // Scale represents reg2 * scale, thus account for 1 if
4206 // it is not equal to 0 or 1.
4207 return AM.Scale != 0 && AM.Scale != 1;
4208 return -1;
4209}
4210
4212 // For the binary operators (e.g. or) we need to be more careful than for
4213 // selects; here we only transform them if they are already at a natural
4214 // break point in the code - the end of a block with an unconditional
4215 // terminator.
4216 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4217 isa<BranchInst>(I->getNextNode()) &&
4218 cast<BranchInst>(I->getNextNode())->isUnconditional())
4219 return true;
4221}
4222
4224 const TargetTransformInfo::LSRCost &C2) {
4225 // The AArch64-specific change here is to add the number of instructions to
4226 // the comparison (though not as the first consideration, as some targets do),
4227 // along with changing the priority of the base additions.
4228 // TODO: Maybe a more nuanced tradeoff between instruction count
4229 // and number of registers? To be investigated at a later date.
4230 if (EnableLSRCostOpt)
4231 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4232 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4233 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4234 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4235
4237}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
amdgpu AMDGPU Register Bank Select
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
uint64_t IntrinsicInst * II
#define P(N)
const char LLVMTargetMachineRef TM
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
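For context, client passes normally reach this hook through the public TargetTransformInfo wrapper rather than calling the implementation class directly; a minimal usage sketch follows, with types and constants chosen purely for illustration.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Ask for the reciprocal-throughput cost of a plain 128-bit vector load
// (<4 x i32>, 16-byte aligned, default address space).
static InstructionCost estimateLoadCost(const TargetTransformInfo &TTI,
                                        LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(16),
                             /*AddressSpace=*/0,
                             TargetTransformInfo::TCK_RecipThroughput);
}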
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
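The usual approach, shown as a hedged sketch consistent with how AArch64 materializes 64-bit immediates; it may not match the file's body line for line.

#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Immediates that encode directly (zero, or a valid 64-bit logical
// immediate) are treated as free; otherwise count the MOVZ/MOVN + MOVK
// sequence the immediate expander would emit.
static InstructionCost intImmCostSketch(int64_t Val) {
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;
  if (Val < 0)
    Val = ~Val; // negative values are built with MOVN; cost matches ~Val
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}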
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
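As a rough, hedged sketch of the calculation this describes: divide the total access width by the legal register width (128 bits for NEON, at least the configured minimum SVE vector length for scalable types) and round up. The function name and exact parameters below are illustrative.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include <algorithm>
using namespace llvm;

// Illustrative only: estimate how many legal vector accesses are needed to
// cover VecTy when lowering an interleaved access group.
static unsigned numInterleavedAccessesSketch(VectorType *VecTy,
                                             const DataLayout &DL,
                                             unsigned MinSVEBits,
                                             bool UseScalable) {
  unsigned RegWidth = UseScalable ? std::max(MinSVEBits, 128u) : 128u;
  uint64_t TotalBits = DL.getTypeSizeInBits(VecTy).getKnownMinValue();
  return static_cast<unsigned>(
      std::max<uint64_t>(1, (TotalBits + RegWidth - 1) / RegWidth));
}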
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:77
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:428
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1628
unsigned countLeadingOnes() const
Definition: APInt.h:1582
void negate()
Negate this APInt in place.
Definition: APInt.h:1429
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1718
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:806
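To ground the APInt entries above, a small self-contained example exercising the listed operations; the values are chosen purely for illustration.

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintDemo() {
  APInt X(64, 48);                        // 64-bit APInt holding 48 (0b110000)
  unsigned Ones = X.popcount();           // 2 bits set
  unsigned Log = X.logBase2();            // 5: index of the highest set bit
  APInt Shifted = X.ashr(4);              // arithmetic shift right -> 3
  APInt Narrow = X.sextOrTrunc(32);       // sign-extend or truncate to 32 bits
  APInt Neg = X;
  Neg.negate();                           // in-place two's-complement negation
  bool NegPow2 = Neg.isNegatedPowerOf2(); // false: 48 is not a power of two
  (void)Ones; (void)Log; (void)Shifted; (void)Narrow; (void)NegPow2;
}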