LLVM 19.0.0git
AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
45static cl::opt<unsigned>
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
47 cl::Hidden);
48
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
53
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
57
58static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
59 cl::init(true), cl::Hidden);
60
61namespace {
62class TailFoldingOption {
63 // These bitfields will only ever be set to something non-zero in operator=,
64 // when setting the -sve-tail-folding option. This option should always be of
65 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
66 // InitialBits is one of (disabled|all|simple). EnableBits represents
67 // additional flags we're enabling, and DisableBits for those flags we're
68 // disabling. The default flag is tracked in the variable NeedsDefault, since
69 // at the time of setting the option we may not know what the default value
70 // for the CPU is.
71 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
72 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
73 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
74
75 // This value needs to be initialised to true in case the user does not
76 // explicitly set the -sve-tail-folding option.
77 bool NeedsDefault = true;
78
79 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
80
81 void setNeedsDefault(bool V) { NeedsDefault = V; }
82
83 void setEnableBit(TailFoldingOpts Bit) {
84 EnableBits |= Bit;
85 DisableBits &= ~Bit;
86 }
87
88 void setDisableBit(TailFoldingOpts Bit) {
89 EnableBits &= ~Bit;
90 DisableBits |= Bit;
91 }
92
93 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
94 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
95
96 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
97 "Initial bits should only include one of "
98 "(disabled|all|simple|default)");
99 Bits = NeedsDefault ? DefaultBits : InitialBits;
100 Bits |= EnableBits;
101 Bits &= ~DisableBits;
102
103 return Bits;
104 }
105
106 void reportError(std::string Opt) {
107 errs() << "invalid argument '" << Opt
108 << "' to -sve-tail-folding=; the option should be of the form\n"
109 " (disabled|all|default|simple)[+(reductions|recurrences"
110 "|reverse|noreductions|norecurrences|noreverse)]\n";
111 report_fatal_error("Unrecognised tail-folding option");
112 }
113
114public:
115
116 void operator=(const std::string &Val) {
117 // If the user explicitly sets -sve-tail-folding= then treat as an error.
118 if (Val.empty()) {
119 reportError("");
120 return;
121 }
122
123 // Since the user is explicitly setting the option we don't automatically
124 // need the default unless they require it.
125 setNeedsDefault(false);
126
127 SmallVector<StringRef, 4> TailFoldTypes;
128 StringRef(Val).split(TailFoldTypes, '+', -1, false);
129
130 unsigned StartIdx = 1;
131 if (TailFoldTypes[0] == "disabled")
132 setInitialBits(TailFoldingOpts::Disabled);
133 else if (TailFoldTypes[0] == "all")
134 setInitialBits(TailFoldingOpts::All);
135 else if (TailFoldTypes[0] == "default")
136 setNeedsDefault(true);
137 else if (TailFoldTypes[0] == "simple")
138 setInitialBits(TailFoldingOpts::Simple);
139 else {
140 StartIdx = 0;
141 setInitialBits(TailFoldingOpts::Disabled);
142 }
143
144 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
145 if (TailFoldTypes[I] == "reductions")
146 setEnableBit(TailFoldingOpts::Reductions);
147 else if (TailFoldTypes[I] == "recurrences")
148 setEnableBit(TailFoldingOpts::Recurrences);
149 else if (TailFoldTypes[I] == "reverse")
150 setEnableBit(TailFoldingOpts::Reverse);
151 else if (TailFoldTypes[I] == "noreductions")
152 setDisableBit(TailFoldingOpts::Reductions);
153 else if (TailFoldTypes[I] == "norecurrences")
154 setDisableBit(TailFoldingOpts::Recurrences);
155 else if (TailFoldTypes[I] == "noreverse")
156 setDisableBit(TailFoldingOpts::Reverse);
157 else
158 reportError(Val);
159 }
160 }
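
 // Illustrative walk-through (not part of the original source): parsing the
 // value "all+noreductions" above sets InitialBits to TailFoldingOpts::All,
 // clears NeedsDefault, and records Reductions in DisableBits, so getBits()
 // later yields "everything except reductions".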
161
162 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
163 return (getBits(DefaultBits) & Required) == Required;
164 }
165};
166} // namespace
167
168TailFoldingOption TailFoldingOptionLoc;
169
171 "sve-tail-folding",
172 cl::desc(
173 "Control the use of vectorisation using tail-folding for SVE where the"
174 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
175 "\ndisabled (Initial) No loop types will vectorize using "
176 "tail-folding"
177 "\ndefault (Initial) Uses the default tail-folding settings for "
178 "the target CPU"
179 "\nall (Initial) All legal loop types will vectorize using "
180 "tail-folding"
181 "\nsimple (Initial) Use tail-folding for simple loops (not "
182 "reductions or recurrences)"
183 "\nreductions Use tail-folding for loops containing reductions"
184 "\nnoreductions Inverse of above"
185 "\nrecurrences Use tail-folding for loops containing fixed order "
186 "recurrences"
187 "\nnorecurrences Inverse of above"
188 "\nreverse Use tail-folding for loops requiring reversed "
189 "predicates"
190 "\nnoreverse Inverse of above"),
191 cl::location(TailFoldingOptionLoc));
192
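// Usage sketch (illustrative, not part of the original source): a plausible
// invocation such as
//   -mllvm -sve-tail-folding=default+reductions+noreverse
// keeps the target CPU's default tail-folding flags, additionally enables
// tail-folding for reduction loops, and disables it for loops that require
// reversed predicates.
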
193// Experimental option that will only be fully functional when the
194// code-generator is changed to use SVE instead of NEON for all fixed-width
195// operations.
197 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
198
199// Experimental option that will only be fully functional when the cost-model
200// and code-generator have been changed to avoid using scalable vector
201// instructions that are not legal in streaming SVE mode.
203 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
204
205static bool isSMEABIRoutineCall(const CallInst &CI) {
206 const auto *F = CI.getCalledFunction();
207 return F && StringSwitch<bool>(F->getName())
208 .Case("__arm_sme_state", true)
209 .Case("__arm_tpidr2_save", true)
210 .Case("__arm_tpidr2_restore", true)
211 .Case("__arm_za_disable", true)
212 .Default(false);
213}
214
215/// Returns true if the function has explicit operations that can only be
216/// lowered using incompatible instructions for the selected mode. This also
217/// returns true if the function F may use or modify ZA state.
218static bool hasPossibleIncompatibleOps(const Function *F) {
219 for (const BasicBlock &BB : *F) {
220 for (const Instruction &I : BB) {
221 // Be conservative for now and assume that any call to inline asm or to
222 // intrinsics could result in non-streaming ops (e.g. calls to
223 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
224 // all native LLVM instructions can be lowered to compatible instructions.
225 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
226 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
227 isSMEABIRoutineCall(cast<CallInst>(I))))
228 return true;
229 }
230 }
231 return false;
232}
233
234bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
235 const Function *Callee) const {
236 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
237
238 // When inlining, we should consider the body of the function, not the
239 // interface.
240 if (CalleeAttrs.hasStreamingBody()) {
241 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
242 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
243 }
244
245 if (CalleeAttrs.isNewZA())
246 return false;
247
248 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
249 CallerAttrs.requiresSMChange(CalleeAttrs)) {
250 if (hasPossibleIncompatibleOps(Callee))
251 return false;
252 }
253
254 const TargetMachine &TM = getTLI()->getTargetMachine();
255
256 const FeatureBitset &CallerBits =
257 TM.getSubtargetImpl(*Caller)->getFeatureBits();
258 const FeatureBitset &CalleeBits =
259 TM.getSubtargetImpl(*Callee)->getFeatureBits();
260
261 // Inline a callee if its target-features are a subset of the caller's
262 // target-features.
263 return (CallerBits & CalleeBits) == CalleeBits;
264}
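
// Illustrative example (not part of the original source): with the subset
// check above, a callee built with "+neon,+sve" can be inlined into a caller
// built with "+neon,+sve,+sve2", but a "+sve2" callee is rejected for a
// caller that only has "+sve".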
265
266bool AArch64TTIImpl::areTypesABICompatible(
267 const Function *Caller, const Function *Callee,
268 const ArrayRef<Type *> &Types) const {
269 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
270 return false;
271
272 // We need to ensure that argument promotion does not attempt to promote
273 // pointers to fixed-length vector types larger than 128 bits like
274 // <8 x float> (and pointers to aggregate types which have such fixed-length
275 // vector type members) into the values of the pointees. Such vector types
276 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
277 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
278 // types can be safely treated as 128-bit NEON types and they cannot be
279 // distinguished in IR.
280 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
281 auto FVTy = dyn_cast<FixedVectorType>(Ty);
282 return FVTy &&
283 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
284 }))
285 return false;
286
287 return true;
288}
289
290unsigned
291AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
292 unsigned DefaultCallPenalty) const {
293 // This function calculates a penalty for executing Call in F.
294 //
295 // There are two ways this function can be called:
296 // (1) F:
297 // call from F -> G (the call here is Call)
298 //
299 // For (1), Call.getCaller() == F, so it will always return a high cost if
300 // a streaming-mode change is required (thus promoting the need to inline the
301 // function)
302 //
303 // (2) F:
304 // call from F -> G (the call here is not Call)
305 // G:
306 // call from G -> H (the call here is Call)
307 //
308 // For (2), if after inlining the body of G into F the call to H requires a
309 // streaming-mode change, and the call to G from F would also require a
310 // streaming-mode change, then there is benefit to do the streaming-mode
311 // change only once and avoid inlining of G into F.
312 SMEAttrs FAttrs(*F);
313 SMEAttrs CalleeAttrs(Call);
314 if (FAttrs.requiresSMChange(CalleeAttrs)) {
315 if (F == Call.getCaller()) // (1)
316 return CallPenaltyChangeSM * DefaultCallPenalty;
317 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
318 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
319 }
320
321 return DefaultCallPenalty;
322}
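
// Worked example (illustrative, not part of the original source): with the
// default option values above (CallPenaltyChangeSM = 5,
// InlineCallPenaltyChangeSM = 10), a call that needs a PSTATE.SM change
// directly from F costs 5 * DefaultCallPenalty (case 1), while the transitive
// case costs 10 * DefaultCallPenalty (case 2).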
323
324bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
325 TargetTransformInfo::RegisterKind K) const {
326 assert(K != TargetTransformInfo::RGK_Scalar);
327 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
328 ST->isNeonAvailable());
329}
330
331/// Calculate the cost of materializing a 64-bit value. This helper
332/// method might only calculate a fraction of a larger immediate. Therefore it
333/// is valid to return a cost of ZERO.
334InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
335 // Check if the immediate can be encoded within an instruction.
336 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
337 return 0;
338
339 if (Val < 0)
340 Val = ~Val;
341
342 // Calculate how many moves we will need to materialize this constant.
343 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
344 AArch64_IMM::expandMOVImm(Val, 64, Insn);
345 return Insn.size();
346}
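
// Illustrative example (not part of the original source): materializing
// 0x123456789ABCDEF0 expands to one MOVZ plus three MOVK instructions, so the
// helper above returns a cost of 4, whereas a logical immediate such as
// 0x00FF00FF00FF00FF is encodable directly and costs 0.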
347
348/// Calculate the cost of materializing the given constant.
349InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
350 TTI::TargetCostKind CostKind) {
351 assert(Ty->isIntegerTy());
352
353 unsigned BitSize = Ty->getPrimitiveSizeInBits();
354 if (BitSize == 0)
355 return ~0U;
356
357 // Sign-extend all constants to a multiple of 64-bit.
358 APInt ImmVal = Imm;
359 if (BitSize & 0x3f)
360 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
361
362 // Split the constant into 64-bit chunks and calculate the cost for each
363 // chunk.
364 InstructionCost Cost = 0;
365 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
366 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
367 int64_t Val = Tmp.getSExtValue();
368 Cost += getIntImmCost(Val);
369 }
370 // We need at least one instruction to materialize the constant.
371 return std::max<InstructionCost>(1, Cost);
372}
373
374InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
375 const APInt &Imm, Type *Ty,
376 TTI::TargetCostKind CostKind,
377 Instruction *Inst) {
378 assert(Ty->isIntegerTy());
379
380 unsigned BitSize = Ty->getPrimitiveSizeInBits();
381 // There is no cost model for constants with a bit size of 0. Return TCC_Free
382 // here, so that constant hoisting will ignore this constant.
383 if (BitSize == 0)
384 return TTI::TCC_Free;
385
386 unsigned ImmIdx = ~0U;
387 switch (Opcode) {
388 default:
389 return TTI::TCC_Free;
390 case Instruction::GetElementPtr:
391 // Always hoist the base address of a GetElementPtr.
392 if (Idx == 0)
393 return 2 * TTI::TCC_Basic;
394 return TTI::TCC_Free;
395 case Instruction::Store:
396 ImmIdx = 0;
397 break;
398 case Instruction::Add:
399 case Instruction::Sub:
400 case Instruction::Mul:
401 case Instruction::UDiv:
402 case Instruction::SDiv:
403 case Instruction::URem:
404 case Instruction::SRem:
405 case Instruction::And:
406 case Instruction::Or:
407 case Instruction::Xor:
408 case Instruction::ICmp:
409 ImmIdx = 1;
410 break;
411 // Always return TCC_Free for the shift value of a shift instruction.
412 case Instruction::Shl:
413 case Instruction::LShr:
414 case Instruction::AShr:
415 if (Idx == 1)
416 return TTI::TCC_Free;
417 break;
418 case Instruction::Trunc:
419 case Instruction::ZExt:
420 case Instruction::SExt:
421 case Instruction::IntToPtr:
422 case Instruction::PtrToInt:
423 case Instruction::BitCast:
424 case Instruction::PHI:
425 case Instruction::Call:
426 case Instruction::Select:
427 case Instruction::Ret:
428 case Instruction::Load:
429 break;
430 }
431
432 if (Idx == ImmIdx) {
433 int NumConstants = (BitSize + 63) / 64;
434 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
435 return (Cost <= NumConstants * TTI::TCC_Basic)
436 ? static_cast<int>(TTI::TCC_Free)
437 : Cost;
438 }
439 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
440}
441
442InstructionCost
443AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
444 const APInt &Imm, Type *Ty,
445 TTI::TargetCostKind CostKind) {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 // Most (all?) AArch64 intrinsics do not support folding immediates into the
455 // selected instruction, so we compute the materialization cost for the
456 // immediate directly.
457 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
458 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
459
460 switch (IID) {
461 default:
462 return TTI::TCC_Free;
463 case Intrinsic::sadd_with_overflow:
464 case Intrinsic::uadd_with_overflow:
465 case Intrinsic::ssub_with_overflow:
466 case Intrinsic::usub_with_overflow:
467 case Intrinsic::smul_with_overflow:
468 case Intrinsic::umul_with_overflow:
469 if (Idx == 1) {
470 int NumConstants = (BitSize + 63) / 64;
471 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
472 return (Cost <= NumConstants * TTI::TCC_Basic)
473 ? static_cast<int>(TTI::TCC_Free)
474 : Cost;
475 }
476 break;
477 case Intrinsic::experimental_stackmap:
478 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
479 return TTI::TCC_Free;
480 break;
481 case Intrinsic::experimental_patchpoint_void:
482 case Intrinsic::experimental_patchpoint:
483 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
484 return TTI::TCC_Free;
485 break;
486 case Intrinsic::experimental_gc_statepoint:
487 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
488 return TTI::TCC_Free;
489 break;
490 }
491 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
492}
493
494TargetTransformInfo::PopcntSupportKind
495AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
496 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
497 if (TyWidth == 32 || TyWidth == 64)
498 return TTI::PSK_FastHardware;
499 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
500 return TTI::PSK_Software;
501}
502
503static bool isUnpackedVectorVT(EVT VecVT) {
504 return VecVT.isScalableVector() &&
505 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
506}
507
508InstructionCost
509AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
510 TTI::TargetCostKind CostKind) {
511 auto *RetTy = ICA.getReturnType();
512 switch (ICA.getID()) {
513 case Intrinsic::umin:
514 case Intrinsic::umax:
515 case Intrinsic::smin:
516 case Intrinsic::smax: {
517 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
518 MVT::v8i16, MVT::v2i32, MVT::v4i32,
519 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
520 MVT::nxv2i64};
521 auto LT = getTypeLegalizationCost(RetTy);
522 // v2i64 types get converted to cmp+bif hence the cost of 2
523 if (LT.second == MVT::v2i64)
524 return LT.first * 2;
525 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
526 return LT.first;
527 break;
528 }
529 case Intrinsic::sadd_sat:
530 case Intrinsic::ssub_sat:
531 case Intrinsic::uadd_sat:
532 case Intrinsic::usub_sat: {
533 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
534 MVT::v8i16, MVT::v2i32, MVT::v4i32,
535 MVT::v2i64};
536 auto LT = getTypeLegalizationCost(RetTy);
537 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
538 // need to extend the type, as it uses shr(qadd(shl, shl)).
539 unsigned Instrs =
540 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
541 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
542 return LT.first * Instrs;
543 break;
544 }
545 case Intrinsic::abs: {
546 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
547 MVT::v8i16, MVT::v2i32, MVT::v4i32,
548 MVT::v2i64};
549 auto LT = getTypeLegalizationCost(RetTy);
550 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
551 return LT.first;
552 break;
553 }
554 case Intrinsic::bswap: {
555 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
556 MVT::v4i32, MVT::v2i64};
557 auto LT = getTypeLegalizationCost(RetTy);
558 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
559 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
560 return LT.first;
561 break;
562 }
563 case Intrinsic::experimental_stepvector: {
564 InstructionCost Cost = 1; // Cost of the `index' instruction
565 auto LT = getTypeLegalizationCost(RetTy);
566 // Legalisation of illegal vectors involves an `index' instruction plus
567 // (LT.first - 1) vector adds.
568 if (LT.first > 1) {
569 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
570 InstructionCost AddCost =
571 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
572 Cost += AddCost * (LT.first - 1);
573 }
574 return Cost;
575 }
576 case Intrinsic::vector_extract:
577 case Intrinsic::vector_insert: {
578 // If both the vector and subvector types are legal types and the index
579 // is 0, then this should be a no-op or simple operation; return a
580 // relatively low cost.
581
582 // If arguments aren't actually supplied, then we cannot determine the
583 // value of the index. We also want to skip predicate types.
584 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
585 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
586 break;
587
588 LLVMContext &C = RetTy->getContext();
589 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
590 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
591 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
592 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
593 // Skip this if either the vector or subvector types are unpacked
594 // SVE types; they may get lowered to stack stores and loads.
595 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
596 break;
597
598 TargetLoweringBase::LegalizeKind SubVecLK =
599 getTLI()->getTypeConversion(C, SubVecVT);
600 TargetLoweringBase::LegalizeKind VecLK =
601 getTLI()->getTypeConversion(C, VecVT);
602 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
603 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
604 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
605 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
606 return TTI::TCC_Free;
607 break;
608 }
609 case Intrinsic::bitreverse: {
610 static const CostTblEntry BitreverseTbl[] = {
611 {Intrinsic::bitreverse, MVT::i32, 1},
612 {Intrinsic::bitreverse, MVT::i64, 1},
613 {Intrinsic::bitreverse, MVT::v8i8, 1},
614 {Intrinsic::bitreverse, MVT::v16i8, 1},
615 {Intrinsic::bitreverse, MVT::v4i16, 2},
616 {Intrinsic::bitreverse, MVT::v8i16, 2},
617 {Intrinsic::bitreverse, MVT::v2i32, 2},
618 {Intrinsic::bitreverse, MVT::v4i32, 2},
619 {Intrinsic::bitreverse, MVT::v1i64, 2},
620 {Intrinsic::bitreverse, MVT::v2i64, 2},
621 };
622 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
623 const auto *Entry =
624 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
625 if (Entry) {
626 // The cost model uses the legal type (i32) that i8 and i16 will be
627 // converted to, plus 1 so that we match the actual lowering cost.
628 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
629 TLI->getValueType(DL, RetTy, true) == MVT::i16)
630 return LegalisationCost.first * Entry->Cost + 1;
631
632 return LegalisationCost.first * Entry->Cost;
633 }
634 break;
635 }
636 case Intrinsic::ctpop: {
637 if (!ST->hasNEON()) {
638 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
639 return getTypeLegalizationCost(RetTy).first * 12;
640 }
641 static const CostTblEntry CtpopCostTbl[] = {
642 {ISD::CTPOP, MVT::v2i64, 4},
643 {ISD::CTPOP, MVT::v4i32, 3},
644 {ISD::CTPOP, MVT::v8i16, 2},
645 {ISD::CTPOP, MVT::v16i8, 1},
646 {ISD::CTPOP, MVT::i64, 4},
647 {ISD::CTPOP, MVT::v2i32, 3},
648 {ISD::CTPOP, MVT::v4i16, 2},
649 {ISD::CTPOP, MVT::v8i8, 1},
650 {ISD::CTPOP, MVT::i32, 5},
651 };
652 auto LT = getTypeLegalizationCost(RetTy);
653 MVT MTy = LT.second;
654 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
655 // Extra cost of +1 when illegal vector types are legalized by promoting
656 // the integer type.
657 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
658 RetTy->getScalarSizeInBits()
659 ? 1
660 : 0;
661 return LT.first * Entry->Cost + ExtraCost;
662 }
663 break;
664 }
665 case Intrinsic::sadd_with_overflow:
666 case Intrinsic::uadd_with_overflow:
667 case Intrinsic::ssub_with_overflow:
668 case Intrinsic::usub_with_overflow:
669 case Intrinsic::smul_with_overflow:
670 case Intrinsic::umul_with_overflow: {
671 static const CostTblEntry WithOverflowCostTbl[] = {
672 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
673 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
674 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
675 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
676 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
677 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
678 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
679 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
680 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
681 {Intrinsic::usub_with_overflow, MVT::i8, 3},
682 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
683 {Intrinsic::usub_with_overflow, MVT::i16, 3},
684 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
685 {Intrinsic::usub_with_overflow, MVT::i32, 1},
686 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
687 {Intrinsic::usub_with_overflow, MVT::i64, 1},
688 {Intrinsic::smul_with_overflow, MVT::i8, 5},
689 {Intrinsic::umul_with_overflow, MVT::i8, 4},
690 {Intrinsic::smul_with_overflow, MVT::i16, 5},
691 {Intrinsic::umul_with_overflow, MVT::i16, 4},
692 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
693 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
694 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
695 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
696 };
697 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
698 if (MTy.isSimple())
699 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
700 MTy.getSimpleVT()))
701 return Entry->Cost;
702 break;
703 }
704 case Intrinsic::fptosi_sat:
705 case Intrinsic::fptoui_sat: {
706 if (ICA.getArgTypes().empty())
707 break;
708 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
709 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
710 EVT MTy = TLI->getValueType(DL, RetTy);
711 // Check for the legal types, which are where the size of the input and the
712 // output are the same, or we are using cvt f64->i32 or f32->i64.
713 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
714 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
715 LT.second == MVT::v2f64) &&
716 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
717 (LT.second == MVT::f64 && MTy == MVT::i32) ||
718 (LT.second == MVT::f32 && MTy == MVT::i64)))
719 return LT.first;
720 // Similarly for fp16 sizes
721 if (ST->hasFullFP16() &&
722 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
723 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
724 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
725 return LT.first;
726
727 // Otherwise we use a legal convert followed by a min+max
728 if ((LT.second.getScalarType() == MVT::f32 ||
729 LT.second.getScalarType() == MVT::f64 ||
730 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
731 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
732 Type *LegalTy =
733 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
734 if (LT.second.isVector())
735 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
736 InstructionCost Cost = 1;
737 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
738 LegalTy, {LegalTy, LegalTy});
739 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
740 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
741 LegalTy, {LegalTy, LegalTy});
742 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
743 return LT.first * Cost;
744 }
745 break;
746 }
747 case Intrinsic::fshl:
748 case Intrinsic::fshr: {
749 if (ICA.getArgs().empty())
750 break;
751
752 // TODO: Add handling for fshl where third argument is not a constant.
753 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
754 if (!OpInfoZ.isConstant())
755 break;
756
757 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
758 if (OpInfoZ.isUniform()) {
759 // FIXME: The costs could be lower if the codegen is better.
760 static const CostTblEntry FshlTbl[] = {
761 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
762 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
763 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
764 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
765 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
766 // to avoid having to duplicate the costs.
767 const auto *Entry =
768 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
769 if (Entry)
770 return LegalisationCost.first * Entry->Cost;
771 }
772
773 auto TyL = getTypeLegalizationCost(RetTy);
774 if (!RetTy->isIntegerTy())
775 break;
776
777 // Estimate cost manually, as types like i8 and i16 will get promoted to
778 // i32 and CostTableLookup will ignore the extra conversion cost.
779 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
780 RetTy->getScalarSizeInBits() < 64) ||
781 (RetTy->getScalarSizeInBits() % 64 != 0);
782 unsigned ExtraCost = HigherCost ? 1 : 0;
783 if (RetTy->getScalarSizeInBits() == 32 ||
784 RetTy->getScalarSizeInBits() == 64)
785 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
786 // extr instruction.
787 else if (HigherCost)
788 ExtraCost = 1;
789 else
790 break;
791 return TyL.first + ExtraCost;
792 }
793 case Intrinsic::get_active_lane_mask: {
794 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
795 if (RetTy) {
796 EVT RetVT = getTLI()->getValueType(DL, RetTy);
797 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
798 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
799 !getTLI()->isTypeLegal(RetVT)) {
800 // We don't have enough context at this point to determine if the mask
801 // is going to be kept live after the block, which will force the vXi1
802 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
803 // For now, we just assume the vectorizer created this intrinsic and
804 // the result will be the input for a PHI. In this case the cost will
805 // be extremely high for fixed-width vectors.
806 // NOTE: getScalarizationOverhead returns a cost that's far too
807 // pessimistic for the actual generated codegen. In reality there are
808 // two instructions generated per lane.
809 return RetTy->getNumElements() * 2;
810 }
811 }
812 break;
813 }
814 default:
815 break;
816 }
818}
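
// Worked example (illustrative, not part of the original source): for
// llvm.smin on <2 x i64> the min/max case above returns LT.first * 2, because
// v2i64 is lowered as a compare plus BIF, while <4 x i32> is in the
// ValidMinMaxTys list and costs just LT.first.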
819
820/// The function will remove redundant reinterpret casts in the presence
821/// of control flow.
822static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
823 IntrinsicInst &II) {
824 SmallVector<Instruction *, 32> Worklist;
825 auto RequiredType = II.getType();
826
827 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
828 assert(PN && "Expected Phi Node!");
829
830 // Don't create a new Phi unless we can remove the old one.
831 if (!PN->hasOneUse())
832 return std::nullopt;
833
834 for (Value *IncValPhi : PN->incoming_values()) {
835 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
836 if (!Reinterpret ||
837 Reinterpret->getIntrinsicID() !=
838 Intrinsic::aarch64_sve_convert_to_svbool ||
839 RequiredType != Reinterpret->getArgOperand(0)->getType())
840 return std::nullopt;
841 }
842
843 // Create the new Phi
844 IC.Builder.SetInsertPoint(PN);
845 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
846 Worklist.push_back(PN);
847
848 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
849 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
850 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
851 Worklist.push_back(Reinterpret);
852 }
853
854 // Cleanup Phi Node and reinterprets
855 return IC.replaceInstUsesWith(II, NPN);
856}
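
// Illustrative IR sketch (not part of the original source), assuming
// <vscale x 4 x i1> inputs: a phi whose incoming values are all
// convert.to.svbool results, and whose only use is a convert.from.svbool back
// to <vscale x 4 x i1>, is rewritten into a phi over the original
// <vscale x 4 x i1> values, so both conversions become dead.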
857
858// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
859// => (binop (pred) (from_svbool _) (from_svbool _))
860//
861// The above transformation eliminates a `to_svbool` in the predicate
862// operand of bitwise operation `binop` by narrowing the vector width of
863// the operation. For example, it would convert a `<vscale x 16 x i1>
864// and` into a `<vscale x 4 x i1> and`. This is profitable because
865// to_svbool must zero the new lanes during widening, whereas
866// from_svbool is free.
867static std::optional<Instruction *>
868tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
869 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
870 if (!BinOp)
871 return std::nullopt;
872
873 auto IntrinsicID = BinOp->getIntrinsicID();
874 switch (IntrinsicID) {
875 case Intrinsic::aarch64_sve_and_z:
876 case Intrinsic::aarch64_sve_bic_z:
877 case Intrinsic::aarch64_sve_eor_z:
878 case Intrinsic::aarch64_sve_nand_z:
879 case Intrinsic::aarch64_sve_nor_z:
880 case Intrinsic::aarch64_sve_orn_z:
881 case Intrinsic::aarch64_sve_orr_z:
882 break;
883 default:
884 return std::nullopt;
885 }
886
887 auto BinOpPred = BinOp->getOperand(0);
888 auto BinOpOp1 = BinOp->getOperand(1);
889 auto BinOpOp2 = BinOp->getOperand(2);
890
891 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
892 if (!PredIntr ||
893 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
894 return std::nullopt;
895
896 auto PredOp = PredIntr->getOperand(0);
897 auto PredOpTy = cast<VectorType>(PredOp->getType());
898 if (PredOpTy != II.getType())
899 return std::nullopt;
900
901 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
902 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
903 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
904 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
905 if (BinOpOp1 == BinOpOp2)
906 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
907 else
908 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
909 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
910
911 auto NarrowedBinOp =
912 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
913 return IC.replaceInstUsesWith(II, NarrowedBinOp);
914}
915
916static std::optional<Instruction *>
917instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
918 // If the reinterpret instruction operand is a PHI Node
919 if (isa<PHINode>(II.getArgOperand(0)))
920 return processPhiNode(IC, II);
921
922 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
923 return BinOpCombine;
924
925 // Ignore converts to/from svcount_t.
926 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
927 isa<TargetExtType>(II.getType()))
928 return std::nullopt;
929
930 SmallVector<Instruction *, 32> CandidatesForRemoval;
931 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
932
933 const auto *IVTy = cast<VectorType>(II.getType());
934
935 // Walk the chain of conversions.
936 while (Cursor) {
937 // If the type of the cursor has fewer lanes than the final result, zeroing
938 // must take place, which breaks the equivalence chain.
939 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
940 if (CursorVTy->getElementCount().getKnownMinValue() <
941 IVTy->getElementCount().getKnownMinValue())
942 break;
943
944 // If the cursor has the same type as I, it is a viable replacement.
945 if (Cursor->getType() == IVTy)
946 EarliestReplacement = Cursor;
947
948 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
949
950 // If this is not an SVE conversion intrinsic, this is the end of the chain.
951 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
952 Intrinsic::aarch64_sve_convert_to_svbool ||
953 IntrinsicCursor->getIntrinsicID() ==
954 Intrinsic::aarch64_sve_convert_from_svbool))
955 break;
956
957 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
958 Cursor = IntrinsicCursor->getOperand(0);
959 }
960
961 // If no viable replacement in the conversion chain was found, there is
962 // nothing to do.
963 if (!EarliestReplacement)
964 return std::nullopt;
965
966 return IC.replaceInstUsesWith(II, EarliestReplacement);
967}
968
969static bool isAllActivePredicate(Value *Pred) {
970 // Look through convert.from.svbool(convert.to.svbool(...) chain.
971 Value *UncastedPred;
972 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
973 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
974 m_Value(UncastedPred)))))
975 // If the predicate has the same or fewer lanes than the uncasted
976 // predicate then we know the casting has no effect.
977 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
978 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
979 Pred = UncastedPred;
980
981 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
982 m_ConstantInt<AArch64SVEPredPattern::all>()));
983}
984
985static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
986 IntrinsicInst &II) {
987 // svsel(ptrue, x, y) => x
988 auto *OpPredicate = II.getOperand(0);
989 if (isAllActivePredicate(OpPredicate))
990 return IC.replaceInstUsesWith(II, II.getOperand(1));
991
992 auto Select =
993 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
994 return IC.replaceInstUsesWith(II, Select);
995}
996
997static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
998 IntrinsicInst &II) {
999 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1000 if (!Pg)
1001 return std::nullopt;
1002
1003 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1004 return std::nullopt;
1005
1006 const auto PTruePattern =
1007 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1008 if (PTruePattern != AArch64SVEPredPattern::vl1)
1009 return std::nullopt;
1010
1011 // The intrinsic is inserting into lane zero so use an insert instead.
1012 auto *IdxTy = Type::getInt64Ty(II.getContext());
1013 auto *Insert = InsertElementInst::Create(
1014 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1015 Insert->insertBefore(&II);
1016 Insert->takeName(&II);
1017
1018 return IC.replaceInstUsesWith(II, Insert);
1019}
1020
1021static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1022 IntrinsicInst &II) {
1023 // Replace DupX with a regular IR splat.
1024 auto *RetTy = cast<ScalableVectorType>(II.getType());
1025 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1026 II.getArgOperand(0));
1027 Splat->takeName(&II);
1028 return IC.replaceInstUsesWith(II, Splat);
1029}
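
// Illustrative IR sketch (not part of the original source):
//   %s = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %x)
// becomes the generic splat idiom
//   %ins   = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
//   %splat = shufflevector <vscale x 4 x i32> %ins, poison, zeroinitializer
// which later passes already know how to fold.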
1030
1031static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1032 IntrinsicInst &II) {
1033 LLVMContext &Ctx = II.getContext();
1034
1035 // Check that the predicate is all active
1036 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1037 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1038 return std::nullopt;
1039
1040 const auto PTruePattern =
1041 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1042 if (PTruePattern != AArch64SVEPredPattern::all)
1043 return std::nullopt;
1044
1045 // Check that we have a compare of zero..
1046 auto *SplatValue =
1047 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1048 if (!SplatValue || !SplatValue->isZero())
1049 return std::nullopt;
1050
1051 // ..against a dupq
1052 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1053 if (!DupQLane ||
1054 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1055 return std::nullopt;
1056
1057 // Where the dupq is a lane 0 replicate of a vector insert
1058 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1059 return std::nullopt;
1060
1061 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1062 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1063 return std::nullopt;
1064
1065 // Where the vector insert is a fixed constant vector insert into undef at
1066 // index zero
1067 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1068 return std::nullopt;
1069
1070 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1071 return std::nullopt;
1072
1073 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1074 if (!ConstVec)
1075 return std::nullopt;
1076
1077 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1078 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1079 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1080 return std::nullopt;
1081
1082 unsigned NumElts = VecTy->getNumElements();
1083 unsigned PredicateBits = 0;
1084
1085 // Expand intrinsic operands to a 16-bit byte level predicate
1086 for (unsigned I = 0; I < NumElts; ++I) {
1087 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1088 if (!Arg)
1089 return std::nullopt;
1090 if (!Arg->isZero())
1091 PredicateBits |= 1 << (I * (16 / NumElts));
1092 }
1093
1094 // If all bits are zero bail early with an empty predicate
1095 if (PredicateBits == 0) {
1096 auto *PFalse = Constant::getNullValue(II.getType());
1097 PFalse->takeName(&II);
1098 return IC.replaceInstUsesWith(II, PFalse);
1099 }
1100
1101 // Calculate largest predicate type used (where byte predicate is largest)
1102 unsigned Mask = 8;
1103 for (unsigned I = 0; I < 16; ++I)
1104 if ((PredicateBits & (1 << I)) != 0)
1105 Mask |= (I % 8);
1106
1107 unsigned PredSize = Mask & -Mask;
1108 auto *PredType = ScalableVectorType::get(
1109 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1110
1111 // Ensure all relevant bits are set
1112 for (unsigned I = 0; I < 16; I += PredSize)
1113 if ((PredicateBits & (1 << I)) == 0)
1114 return std::nullopt;
1115
1116 auto *PTruePat =
1117 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1118 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1119 {PredType}, {PTruePat});
1120 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1121 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1122 auto *ConvertFromSVBool =
1123 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1124 {II.getType()}, {ConvertToSVBool});
1125
1126 ConvertFromSVBool->takeName(&II);
1127 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1128}
1129
1130static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1131 IntrinsicInst &II) {
1132 Value *Pg = II.getArgOperand(0);
1133 Value *Vec = II.getArgOperand(1);
1134 auto IntrinsicID = II.getIntrinsicID();
1135 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1136
1137 // lastX(splat(X)) --> X
1138 if (auto *SplatVal = getSplatValue(Vec))
1139 return IC.replaceInstUsesWith(II, SplatVal);
1140
1141 // If x and/or y is a splat value then:
1142 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1143 Value *LHS, *RHS;
1144 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1145 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1146 auto *OldBinOp = cast<BinaryOperator>(Vec);
1147 auto OpC = OldBinOp->getOpcode();
1148 auto *NewLHS =
1149 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1150 auto *NewRHS =
1151 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1152 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1153 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1154 return IC.replaceInstUsesWith(II, NewBinOp);
1155 }
1156 }
1157
1158 auto *C = dyn_cast<Constant>(Pg);
1159 if (IsAfter && C && C->isNullValue()) {
1160 // The intrinsic is extracting lane 0 so use an extract instead.
1161 auto *IdxTy = Type::getInt64Ty(II.getContext());
1162 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1163 Extract->insertBefore(&II);
1164 Extract->takeName(&II);
1165 return IC.replaceInstUsesWith(II, Extract);
1166 }
1167
1168 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1169 if (!IntrPG)
1170 return std::nullopt;
1171
1172 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1173 return std::nullopt;
1174
1175 const auto PTruePattern =
1176 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1177
1178 // Can the intrinsic's predicate be converted to a known constant index?
1179 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1180 if (!MinNumElts)
1181 return std::nullopt;
1182
1183 unsigned Idx = MinNumElts - 1;
1184 // Increment the index if extracting the element after the last active
1185 // predicate element.
1186 if (IsAfter)
1187 ++Idx;
1188
1189 // Ignore extracts whose index is larger than the known minimum vector
1190 // length. NOTE: This is an artificial constraint where we prefer to
1191 // maintain what the user asked for until an alternative is proven faster.
1192 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1193 if (Idx >= PgVTy->getMinNumElements())
1194 return std::nullopt;
1195
1196 // The intrinsic is extracting a fixed lane so use an extract instead.
1197 auto *IdxTy = Type::getInt64Ty(II.getContext());
1198 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1199 Extract->insertBefore(&II);
1200 Extract->takeName(&II);
1201 return IC.replaceInstUsesWith(II, Extract);
1202}
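
// Illustrative example (not part of the original source): lastb applied to a
// ptrue with pattern vl4 reads the element at index 3, so the intrinsic is
// replaced by "extractelement %vec, i64 3"; lasta on the same predicate uses
// index 4 instead, provided the index stays below the minimum vector length.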
1203
1204static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1205 IntrinsicInst &II) {
1206 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1207 // integer variant across a variety of micro-architectures. Replace scalar
1208 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1209 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1210 // depending on the micro-architecture, but has been observed as generally
1211 // being faster, particularly when the CLAST[AB] op is a loop-carried
1212 // dependency.
1213 Value *Pg = II.getArgOperand(0);
1214 Value *Fallback = II.getArgOperand(1);
1215 Value *Vec = II.getArgOperand(2);
1216 Type *Ty = II.getType();
1217
1218 if (!Ty->isIntegerTy())
1219 return std::nullopt;
1220
1221 Type *FPTy;
1222 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1223 default:
1224 return std::nullopt;
1225 case 16:
1226 FPTy = IC.Builder.getHalfTy();
1227 break;
1228 case 32:
1229 FPTy = IC.Builder.getFloatTy();
1230 break;
1231 case 64:
1232 FPTy = IC.Builder.getDoubleTy();
1233 break;
1234 }
1235
1236 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1237 auto *FPVTy = VectorType::get(
1238 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1239 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1240 auto *FPII = IC.Builder.CreateIntrinsic(
1241 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1242 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1243 return IC.replaceInstUsesWith(II, FPIItoInt);
1244}
1245
1246static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1247 IntrinsicInst &II) {
1248 LLVMContext &Ctx = II.getContext();
1249 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1250 // can work with RDFFR_PP for ptest elimination.
1251 auto *AllPat =
1252 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1253 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1254 {II.getType()}, {AllPat});
1255 auto *RDFFR =
1256 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1257 RDFFR->takeName(&II);
1258 return IC.replaceInstUsesWith(II, RDFFR);
1259}
1260
1261static std::optional<Instruction *>
1262instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1263 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1264
1265 if (Pattern == AArch64SVEPredPattern::all) {
1266 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1267 auto *VScale = IC.Builder.CreateVScale(StepVal);
1268 VScale->takeName(&II);
1269 return IC.replaceInstUsesWith(II, VScale);
1270 }
1271
1272 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1273
1274 return MinNumElts && NumElts >= MinNumElts
1275 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1276 II, ConstantInt::get(II.getType(), MinNumElts)))
1277 : std::nullopt;
1278}
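
// Illustrative example (not part of the original source): assuming this
// helper is invoked for cntw with NumElts == 4, the "all" pattern becomes
// "vscale * 4", a vl2 pattern (2 <= 4) folds to the constant 2, and a vl16
// pattern is left alone because 16 exceeds the guaranteed element count.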
1279
1280static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1281 IntrinsicInst &II) {
1282 Value *PgVal = II.getArgOperand(0);
1283 Value *OpVal = II.getArgOperand(1);
1284
1285 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1286 // Later optimizations prefer this form.
1287 if (PgVal == OpVal &&
1288 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1289 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1290 Value *Ops[] = {PgVal, OpVal};
1291 Type *Tys[] = {PgVal->getType()};
1292
1293 auto *PTest =
1294 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1295 PTest->takeName(&II);
1296
1297 return IC.replaceInstUsesWith(II, PTest);
1298 }
1299
1300 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1301 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1302
1303 if (!Pg || !Op)
1304 return std::nullopt;
1305
1306 Intrinsic::ID OpIID = Op->getIntrinsicID();
1307
1308 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1309 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1310 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1311 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1312 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1313
1314 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1315
1316 PTest->takeName(&II);
1317 return IC.replaceInstUsesWith(II, PTest);
1318 }
1319
1320 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1321 // Later optimizations may rewrite sequence to use the flag-setting variant
1322 // of instruction X to remove PTEST.
1323 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1324 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1325 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1326 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1327 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1328 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1329 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1330 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1331 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1332 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1333 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1334 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1335 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1336 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1337 Type *Tys[] = {Pg->getType()};
1338
1339 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1340 PTest->takeName(&II);
1341
1342 return IC.replaceInstUsesWith(II, PTest);
1343 }
1344
1345 return std::nullopt;
1346}
1347
1348template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1349static std::optional<Instruction *>
1350instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1351 bool MergeIntoAddendOp) {
1352 Value *P = II.getOperand(0);
1353 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1354 if (MergeIntoAddendOp) {
1355 AddendOp = II.getOperand(1);
1356 Mul = II.getOperand(2);
1357 } else {
1358 AddendOp = II.getOperand(2);
1359 Mul = II.getOperand(1);
1360 }
1361
1362 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1363 m_Value(MulOp1))))
1364 return std::nullopt;
1365
1366 if (!Mul->hasOneUse())
1367 return std::nullopt;
1368
1369 Instruction *FMFSource = nullptr;
1370 if (II.getType()->isFPOrFPVectorTy()) {
1371 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1372 // Stop the combine when the flags on the inputs differ in case dropping
1373 // flags would lead to us missing out on more beneficial optimizations.
1374 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1375 return std::nullopt;
1376 if (!FAddFlags.allowContract())
1377 return std::nullopt;
1378 FMFSource = &II;
1379 }
1380
1381 CallInst *Res;
1382 if (MergeIntoAddendOp)
1383 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1384 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1385 else
1386 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1387 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1388
1389 return IC.replaceInstUsesWith(II, Res);
1390}
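
// Illustrative example (not part of the original source): with MulOpc set to
// aarch64_sve_fmul and FuseOpc to aarch64_sve_fmla, a contractable
// fadd(p, %acc, fmul(p, %a, %b)) whose fmul has a single use is folded into
// fmla(p, %acc, %a, %b).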
1391
1392static std::optional<Instruction *>
1393instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1394 Value *Pred = II.getOperand(0);
1395 Value *PtrOp = II.getOperand(1);
1396 Type *VecTy = II.getType();
1397
1398 if (isAllActivePredicate(Pred)) {
1399 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1400 Load->copyMetadata(II);
1401 return IC.replaceInstUsesWith(II, Load);
1402 }
1403
1404 CallInst *MaskedLoad =
1405 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1406 Pred, ConstantAggregateZero::get(VecTy));
1407 MaskedLoad->copyMetadata(II);
1408 return IC.replaceInstUsesWith(II, MaskedLoad);
1409}
1410
1411static std::optional<Instruction *>
1412instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1413 Value *VecOp = II.getOperand(0);
1414 Value *Pred = II.getOperand(1);
1415 Value *PtrOp = II.getOperand(2);
1416
1417 if (isAllActivePredicate(Pred)) {
1418 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1419 Store->copyMetadata(II);
1420 return IC.eraseInstFromFunction(II);
1421 }
1422
1423 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1424 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1425 MaskedStore->copyMetadata(II);
1426 return IC.eraseInstFromFunction(II);
1427}
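
// Illustrative example (not part of the original source): an aarch64.sve.st1
// whose governing predicate is ptrue(all) becomes a plain unmasked store of
// the vector, while any other predicate is rewritten to the generic
// llvm.masked.store intrinsic using the pointer's alignment and copying the
// call's metadata.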
1428
1429static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1430 switch (Intrinsic) {
1431 case Intrinsic::aarch64_sve_fmul_u:
1432 return Instruction::BinaryOps::FMul;
1433 case Intrinsic::aarch64_sve_fadd_u:
1434 return Instruction::BinaryOps::FAdd;
1435 case Intrinsic::aarch64_sve_fsub_u:
1436 return Instruction::BinaryOps::FSub;
1437 default:
1438 return Instruction::BinaryOpsEnd;
1439 }
1440}
1441
1442static std::optional<Instruction *>
1443instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1444 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1445 if (II.isStrictFP())
1446 return std::nullopt;
1447
1448 auto *OpPredicate = II.getOperand(0);
1449 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1450 if (BinOpCode == Instruction::BinaryOpsEnd ||
1451 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1452 m_ConstantInt<AArch64SVEPredPattern::all>())))
1453 return std::nullopt;
1454 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1455 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1456 auto BinOp =
1457 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1458 return IC.replaceInstUsesWith(II, BinOp);
1459}
1460
1461// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1462// sve.add_u).
1463static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1464 Intrinsic::ID IID) {
1465 auto *OpPredicate = II.getOperand(0);
1466 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1467 m_ConstantInt<AArch64SVEPredPattern::all>())))
1468 return std::nullopt;
1469
1470 auto *Mod = II.getModule();
1471 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1472 II.setCalledFunction(NewDecl);
1473
1474 return &II;
1475}
1476
1477// Simplify operations where predicate has all inactive lanes or try to replace
1478// with _u form when all lanes are active
1479static std::optional<Instruction *>
1481 Intrinsic::ID IID) {
1482 if (match(II.getOperand(0), m_ZeroInt())) {
1483 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1484 // inactive for sv[func]_m
1485 return IC.replaceInstUsesWith(II, II.getOperand(1));
1486 }
1487 return instCombineSVEAllActive(II, IID);
1488}
1489
1490static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1491 IntrinsicInst &II) {
1492 if (auto II_U =
1493 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1494 return II_U;
1495 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1496 Intrinsic::aarch64_sve_mla>(
1497 IC, II, true))
1498 return MLA;
1499 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1500 Intrinsic::aarch64_sve_mad>(
1501 IC, II, false))
1502 return MAD;
1503 return std::nullopt;
1504}
1505
1506static std::optional<Instruction *>
1508 if (auto II_U =
1509 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1510 return II_U;
1511 if (auto FMLA =
1512 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1513 Intrinsic::aarch64_sve_fmla>(IC, II,
1514 true))
1515 return FMLA;
1516 if (auto FMAD =
1517 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1518 Intrinsic::aarch64_sve_fmad>(IC, II,
1519 false))
1520 return FMAD;
1521 if (auto FMLA =
1522 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1523 Intrinsic::aarch64_sve_fmla>(IC, II,
1524 true))
1525 return FMLA;
1526 return std::nullopt;
1527}
1528
1529static std::optional<Instruction *>
1531 if (auto FMLA =
1532 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1533 Intrinsic::aarch64_sve_fmla>(IC, II,
1534 true))
1535 return FMLA;
1536 if (auto FMAD =
1537 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1538 Intrinsic::aarch64_sve_fmad>(IC, II,
1539 false))
1540 return FMAD;
1541 if (auto FMLA_U =
1542 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1543 Intrinsic::aarch64_sve_fmla_u>(
1544 IC, II, true))
1545 return FMLA_U;
1546 return instCombineSVEVectorBinOp(IC, II);
1547}
1548
1549static std::optional<Instruction *>
1551 if (auto II_U =
1552 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1553 return II_U;
1554 if (auto FMLS =
1555 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1556 Intrinsic::aarch64_sve_fmls>(IC, II,
1557 true))
1558 return FMLS;
1559 if (auto FMSB =
1560 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1561 Intrinsic::aarch64_sve_fnmsb>(
1562 IC, II, false))
1563 return FMSB;
1564 if (auto FMLS =
1565 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1566 Intrinsic::aarch64_sve_fmls>(IC, II,
1567 true))
1568 return FMLS;
1569 return std::nullopt;
1570}
1571
1572static std::optional<Instruction *>
1574 if (auto FMLS =
1575 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1576 Intrinsic::aarch64_sve_fmls>(IC, II,
1577 true))
1578 return FMLS;
1579 if (auto FMSB =
1580 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1581 Intrinsic::aarch64_sve_fnmsb>(
1582 IC, II, false))
1583 return FMSB;
1584 if (auto FMLS_U =
1585 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1586 Intrinsic::aarch64_sve_fmls_u>(
1587 IC, II, true))
1588 return FMLS_U;
1589 return instCombineSVEVectorBinOp(IC, II);
1590}
1591
1592static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1593 IntrinsicInst &II) {
1594 if (auto II_U =
1595 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1596 return II_U;
1597 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1598 Intrinsic::aarch64_sve_mls>(
1599 IC, II, true))
1600 return MLS;
1601 return std::nullopt;
1602}
1603
1604static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1605 IntrinsicInst &II,
1606 Intrinsic::ID IID) {
1607 auto *OpPredicate = II.getOperand(0);
1608 auto *OpMultiplicand = II.getOperand(1);
1609 auto *OpMultiplier = II.getOperand(2);
1610
1611 // Return true if a given instruction is a unit splat value, false otherwise.
1612 auto IsUnitSplat = [](auto *I) {
1613 auto *SplatValue = getSplatValue(I);
1614 if (!SplatValue)
1615 return false;
1616 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1617 };
1618
1619 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1620 // with a unit splat value, false otherwise.
1621 auto IsUnitDup = [](auto *I) {
1622 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1623 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1624 return false;
1625
1626 auto *SplatValue = IntrI->getOperand(2);
1627 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1628 };
1629
1630 if (IsUnitSplat(OpMultiplier)) {
1631 // [f]mul pg %n, (dupx 1) => %n
1632 OpMultiplicand->takeName(&II);
1633 return IC.replaceInstUsesWith(II, OpMultiplicand);
1634 } else if (IsUnitDup(OpMultiplier)) {
1635 // [f]mul pg %n, (dup pg 1) => %n
1636 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1637 auto *DupPg = DupInst->getOperand(1);
1638 // TODO: this is naive. The optimization is still valid if DupPg
1639 // 'encompasses' OpPredicate, not only if they're the same predicate.
1640 if (OpPredicate == DupPg) {
1641 OpMultiplicand->takeName(&II);
1642 return IC.replaceInstUsesWith(II, OpMultiplicand);
1643 }
1644 }
1645
1646 return instCombineSVEVectorBinOp(IC, II);
1647}
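// A rough illustration of the two folds above (value names invented for the
// example):
//   %one = dup_x 1.0
//   fmul pg %n, %one   -->  %n       ; IsUnitSplat path
//   %one = dup pg 1.0
//   fmul pg %n, %one   -->  %n       ; IsUnitDup path, same predicate required
// The dup case is deliberately conservative: the dup's predicate must equal
// the multiply's predicate (see the TODO above for the more general case).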
1648
1649static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1650 IntrinsicInst &II) {
1651 Value *UnpackArg = II.getArgOperand(0);
1652 auto *RetTy = cast<ScalableVectorType>(II.getType());
1653 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1654 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1655
1656 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1657 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1658 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1659 ScalarArg =
1660 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1661 Value *NewVal =
1662 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1663 NewVal->takeName(&II);
1664 return IC.replaceInstUsesWith(II, NewVal);
1665 }
1666
1667 return std::nullopt;
1668}
1669static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1670 IntrinsicInst &II) {
1671 auto *OpVal = II.getOperand(0);
1672 auto *OpIndices = II.getOperand(1);
1673 VectorType *VTy = cast<VectorType>(II.getType());
1674
1675 // Check whether OpIndices is a constant splat value < minimal element count
1676 // of result.
1677 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1678 if (!SplatValue ||
1679 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1680 return std::nullopt;
1681
1682 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
1683 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1684 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1685 auto *VectorSplat =
1686 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1687
1688 VectorSplat->takeName(&II);
1689 return IC.replaceInstUsesWith(II, VectorSplat);
1690}
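// Concrete illustration of the combine above (types and names invented):
//   %idx = dup_x i32 2                         ; constant splat index
//   %r   = tbl %v, %idx
// becomes
//   %e = extractelement %v, i32 2
//   %r = splat_vector %e
// The guard above only admits splat indices below the minimum element count,
// so the extractelement is always in range.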
1691
1692static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1693 IntrinsicInst &II) {
1694 Value *A, *B;
1695 Type *RetTy = II.getType();
1696 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1697 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1698
1699 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1700 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1701 if ((match(II.getArgOperand(0),
1702 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1703 match(II.getArgOperand(1),
1704 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1705 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1706 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1707 auto *TyA = cast<ScalableVectorType>(A->getType());
1708 if (TyA == B->getType() &&
1709 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1710 auto *SubVec = IC.Builder.CreateInsertVector(
1711 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1712 auto *ConcatVec = IC.Builder.CreateInsertVector(
1713 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1714 ConcatVec->takeName(&II);
1715 return IC.replaceInstUsesWith(II, ConcatVec);
1716 }
1717 }
1718
1719 return std::nullopt;
1720}
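// Sketch of the concatenation built above, assuming A and B are
// <vscale x 8 x i1> and the uzp1 result is <vscale x 16 x i1>:
//   uzp1(to_svbool(A), to_svbool(B))  -->  <A, B>
// realised as two vector.insert operations: A is inserted at index 0 and B at
// index TyA->getMinNumElements(), i.e. directly after A.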
1721
1722static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1723 IntrinsicInst &II) {
1724 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1725 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1726 Value *A, *B;
1727 if (match(II.getArgOperand(0),
1728 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1729 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1730 m_Specific(A), m_Specific(B))))
1731 return IC.replaceInstUsesWith(
1732 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1733
1734 return std::nullopt;
1735}
1736
1737static std::optional<Instruction *>
1738instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1739 Value *Mask = II.getOperand(0);
1740 Value *BasePtr = II.getOperand(1);
1741 Value *Index = II.getOperand(2);
1742 Type *Ty = II.getType();
1743 Value *PassThru = ConstantAggregateZero::get(Ty);
1744
1745 // Contiguous gather => masked load.
1746 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1747 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1748 Value *IndexBase;
1749 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1750 m_Value(IndexBase), m_SpecificInt(1)))) {
1751 Align Alignment =
1752 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1753
1754 Type *VecPtrTy = PointerType::getUnqual(Ty);
1755 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1756 BasePtr, IndexBase);
1757 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1758 CallInst *MaskedLoad =
1759 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1760 MaskedLoad->takeName(&II);
1761 return IC.replaceInstUsesWith(II, MaskedLoad);
1762 }
1763
1764 return std::nullopt;
1765}
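// Worked example of the contiguous-gather rewrite above (names invented):
//   %idx = sve.index(%off, 1)                 ; %off, %off+1, %off+2, ...
//   %r   = sve.ld1.gather.index(%pg, %base, %idx)
// becomes
//   %gep = getelementptr elt_ty, ptr %base, %off
//   %r   = masked.load(%gep, align, %pg, zeroinitializer)
// since a gather whose indices step by exactly one is just a masked
// contiguous load starting at %base + %off.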
1766
1767static std::optional<Instruction *>
1768instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1769 Value *Val = II.getOperand(0);
1770 Value *Mask = II.getOperand(1);
1771 Value *BasePtr = II.getOperand(2);
1772 Value *Index = II.getOperand(3);
1773 Type *Ty = Val->getType();
1774
1775 // Contiguous scatter => masked store.
1776 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1777 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1778 Value *IndexBase;
1779 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1780 m_Value(IndexBase), m_SpecificInt(1)))) {
1781 Align Alignment =
1782 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1783
1784 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1785 BasePtr, IndexBase);
1786 Type *VecPtrTy = PointerType::getUnqual(Ty);
1787 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1788
1789 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1790
1791 return IC.eraseInstFromFunction(II);
1792 }
1793
1794 return std::nullopt;
1795}
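// The store-side mirror of the gather case above: a stride-one
//   sve.st1.scatter.index(%val, %pg, %base, sve.index(%off, 1))
// is rewritten as masked.store(%val, %base + %off, align, %pg), and the
// original scatter intrinsic is erased.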
1796
1797static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1798 IntrinsicInst &II) {
1799 Type *Int32Ty = IC.Builder.getInt32Ty();
1800 Value *Pred = II.getOperand(0);
1801 Value *Vec = II.getOperand(1);
1802 Value *DivVec = II.getOperand(2);
1803
1804 Value *SplatValue = getSplatValue(DivVec);
1805 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1806 if (!SplatConstantInt)
1807 return std::nullopt;
1808 APInt Divisor = SplatConstantInt->getValue();
1809
1810 if (Divisor.isPowerOf2()) {
1811 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1812 auto ASRD = IC.Builder.CreateIntrinsic(
1813 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1814 return IC.replaceInstUsesWith(II, ASRD);
1815 }
1816 if (Divisor.isNegatedPowerOf2()) {
1817 Divisor.negate();
1818 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1819 auto ASRD = IC.Builder.CreateIntrinsic(
1820 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1821 auto NEG = IC.Builder.CreateIntrinsic(
1822 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1823 return IC.replaceInstUsesWith(II, NEG);
1824 }
1825
1826 return std::nullopt;
1827}
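// Worked example for the two power-of-two cases above (values invented):
//   sdiv %pg, %v, splat(8)   -->  asrd %pg, %v, #3            ; 8 == 1 << 3
//   sdiv %pg, %v, splat(-8)  -->  %t = asrd %pg, %v, #3
//                                 neg %t, %pg, %t
// ASRD (arithmetic shift right for divide) rounds towards zero, so it matches
// the semantics of signed division by a power of two; the negative case just
// negates the result afterwards.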
1828
1829bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1830 size_t VecSize = Vec.size();
1831 if (VecSize == 1)
1832 return true;
1833 if (!isPowerOf2_64(VecSize))
1834 return false;
1835 size_t HalfVecSize = VecSize / 2;
1836
1837 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1838 RHS != Vec.end(); LHS++, RHS++) {
1839 if (*LHS != nullptr && *RHS != nullptr) {
1840 if (*LHS == *RHS)
1841 continue;
1842 else
1843 return false;
1844 }
1845 if (!AllowPoison)
1846 return false;
1847 if (*LHS == nullptr && *RHS != nullptr)
1848 *LHS = *RHS;
1849 }
1850
1851 Vec.resize(HalfVecSize);
1852 SimplifyValuePattern(Vec, AllowPoison);
1853 return true;
1854}
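// Worked examples of the halving above, writing the collected insertelement
// operands as letters and poison entries as '_':
//   (a, b, a, b)  ->  (a, b)         ; second half repeats the first
//   (a, b, _, b)  ->  (a, b)         ; only when AllowPoison is set
//   (a, b, c, d)  ->  unchanged, returns false
// The recursion keeps halving as long as it can, e.g. (a, a, a, a) -> (a).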
1855
1856// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1857// to dupqlane(f64(C)) where C is A concatenated with B
1858static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1859 IntrinsicInst &II) {
1860 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1861 if (!match(II.getOperand(0),
1862 m_Intrinsic<Intrinsic::vector_insert>(
1863 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1864 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1865 return std::nullopt;
1866 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1867
1868 // Insert the scalars into a container ordered by InsertElement index
1869 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1870 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1871 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1872 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1873 CurrentInsertElt = InsertElt->getOperand(0);
1874 }
1875
1876 bool AllowPoison =
1877 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1878 if (!SimplifyValuePattern(Elts, AllowPoison))
1879 return std::nullopt;
1880
1881 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1882 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1883 for (size_t I = 0; I < Elts.size(); I++) {
1884 if (Elts[I] == nullptr)
1885 continue;
1886 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
1887 IC.Builder.getInt64(I));
1888 }
1889 if (InsertEltChain == nullptr)
1890 return std::nullopt;
1891
1892 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1893 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1894 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1895 // be narrowed back to the original type.
1896 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1897 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1898 IIScalableTy->getMinNumElements() /
1899 PatternWidth;
1900
1901 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1902 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1903 auto *WideShuffleMaskTy =
1904 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1905
1906 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
1907 auto InsertSubvector = IC.Builder.CreateInsertVector(
1908 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1909 auto WideBitcast =
1910 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1911 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1912 auto WideShuffle = IC.Builder.CreateShuffleVector(
1913 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1914 auto NarrowBitcast =
1915 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1916
1917 return IC.replaceInstUsesWith(II, NarrowBitcast);
1918}
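// Worked example of the widening arithmetic above. For a <vscale x 8 x half>
// dupq_lane whose 128-bit pattern is (a, b, a, b, a, b, a, b), the insert
// chain simplifies to (a, b), so
//   PatternWidth        = 16 * 2 = 32 bits
//   PatternElementCount = 16 * 8 / 32 = 4
// The (a, b) pair is packed into one i32 lane, splatted across
// <vscale x 4 x i32> with an all-zero shuffle mask, and bitcast back to
// <vscale x 8 x half>, reproducing the repeating (a, b) pattern.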
1919
1920static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1921 IntrinsicInst &II) {
1922 Value *A = II.getArgOperand(0);
1923 Value *B = II.getArgOperand(1);
1924 if (A == B)
1925 return IC.replaceInstUsesWith(II, A);
1926
1927 return std::nullopt;
1928}
1929
1930static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1931 IntrinsicInst &II) {
1932 Value *Pred = II.getOperand(0);
1933 Value *Vec = II.getOperand(1);
1934 Value *Shift = II.getOperand(2);
1935
1936 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1937 Value *AbsPred, *MergedValue;
1938 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1939 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1940 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1941 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1942
1943 return std::nullopt;
1944
1945 // Transform is valid if any of the following are true:
1946 // * The ABS merge value is an undef or non-negative
1947 // * The ABS predicate is all active
1948 // * The ABS predicate and the SRSHL predicates are the same
1949 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1950 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1951 return std::nullopt;
1952
1953 // Only valid when the shift amount is non-negative, otherwise the rounding
1954 // behaviour of SRSHL cannot be ignored.
1955 if (!match(Shift, m_NonNegative()))
1956 return std::nullopt;
1957
1958 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1959 {II.getType()}, {Pred, Vec, Shift});
1960
1961 return IC.replaceInstUsesWith(II, LSL);
1962}
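// Shape of the SRSHL -> LSL rewrite above (operands invented):
//   %a = abs %merge, %abs_pg, %x
//   %r = srshl %pg, %a, splat(2)     ; shift amount known non-negative
// becomes
//   %r = lsl %pg, %a, splat(2)
// subject to the three conditions on %merge, %abs_pg and %pg listed above and
// to the shift amount being non-negative, so SRSHL's rounding never applies.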
1963
1964std::optional<Instruction *>
1965AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1966 IntrinsicInst &II) const {
1967 Intrinsic::ID IID = II.getIntrinsicID();
1968 switch (IID) {
1969 default:
1970 break;
1971 case Intrinsic::aarch64_neon_fmaxnm:
1972 case Intrinsic::aarch64_neon_fminnm:
1973 return instCombineMaxMinNM(IC, II);
1974 case Intrinsic::aarch64_sve_convert_from_svbool:
1975 return instCombineConvertFromSVBool(IC, II);
1976 case Intrinsic::aarch64_sve_dup:
1977 return instCombineSVEDup(IC, II);
1978 case Intrinsic::aarch64_sve_dup_x:
1979 return instCombineSVEDupX(IC, II);
1980 case Intrinsic::aarch64_sve_cmpne:
1981 case Intrinsic::aarch64_sve_cmpne_wide:
1982 return instCombineSVECmpNE(IC, II);
1983 case Intrinsic::aarch64_sve_rdffr:
1984 return instCombineRDFFR(IC, II);
1985 case Intrinsic::aarch64_sve_lasta:
1986 case Intrinsic::aarch64_sve_lastb:
1987 return instCombineSVELast(IC, II);
1988 case Intrinsic::aarch64_sve_clasta_n:
1989 case Intrinsic::aarch64_sve_clastb_n:
1990 return instCombineSVECondLast(IC, II);
1991 case Intrinsic::aarch64_sve_cntd:
1992 return instCombineSVECntElts(IC, II, 2);
1993 case Intrinsic::aarch64_sve_cntw:
1994 return instCombineSVECntElts(IC, II, 4);
1995 case Intrinsic::aarch64_sve_cnth:
1996 return instCombineSVECntElts(IC, II, 8);
1997 case Intrinsic::aarch64_sve_cntb:
1998 return instCombineSVECntElts(IC, II, 16);
1999 case Intrinsic::aarch64_sve_ptest_any:
2000 case Intrinsic::aarch64_sve_ptest_first:
2001 case Intrinsic::aarch64_sve_ptest_last:
2002 return instCombineSVEPTest(IC, II);
2003 case Intrinsic::aarch64_sve_fabd:
2004 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2005 case Intrinsic::aarch64_sve_fadd:
2006 return instCombineSVEVectorFAdd(IC, II);
2007 case Intrinsic::aarch64_sve_fadd_u:
2008 return instCombineSVEVectorFAddU(IC, II);
2009 case Intrinsic::aarch64_sve_fdiv:
2010 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2011 case Intrinsic::aarch64_sve_fmax:
2012 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2013 case Intrinsic::aarch64_sve_fmaxnm:
2014 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2015 case Intrinsic::aarch64_sve_fmin:
2016 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2017 case Intrinsic::aarch64_sve_fminnm:
2018 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2019 case Intrinsic::aarch64_sve_fmla:
2020 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2021 case Intrinsic::aarch64_sve_fmls:
2022 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2023 case Intrinsic::aarch64_sve_fmul:
2024 if (auto II_U =
2025 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2026 return II_U;
2027 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2028 case Intrinsic::aarch64_sve_fmul_u:
2029 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2030 case Intrinsic::aarch64_sve_fmulx:
2031 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2032 case Intrinsic::aarch64_sve_fnmla:
2033 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2034 case Intrinsic::aarch64_sve_fnmls:
2035 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2036 case Intrinsic::aarch64_sve_fsub:
2037 return instCombineSVEVectorFSub(IC, II);
2038 case Intrinsic::aarch64_sve_fsub_u:
2039 return instCombineSVEVectorFSubU(IC, II);
2040 case Intrinsic::aarch64_sve_add:
2041 return instCombineSVEVectorAdd(IC, II);
2042 case Intrinsic::aarch64_sve_add_u:
2043 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2044 Intrinsic::aarch64_sve_mla_u>(
2045 IC, II, true);
2046 case Intrinsic::aarch64_sve_mla:
2047 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2048 case Intrinsic::aarch64_sve_mls:
2049 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2050 case Intrinsic::aarch64_sve_mul:
2051 if (auto II_U =
2052 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2053 return II_U;
2054 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2055 case Intrinsic::aarch64_sve_mul_u:
2056 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2057 case Intrinsic::aarch64_sve_sabd:
2058 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2059 case Intrinsic::aarch64_sve_smax:
2060 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2061 case Intrinsic::aarch64_sve_smin:
2062 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2063 case Intrinsic::aarch64_sve_smulh:
2064 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2065 case Intrinsic::aarch64_sve_sub:
2066 return instCombineSVEVectorSub(IC, II);
2067 case Intrinsic::aarch64_sve_sub_u:
2068 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2069 Intrinsic::aarch64_sve_mls_u>(
2070 IC, II, true);
2071 case Intrinsic::aarch64_sve_uabd:
2072 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2073 case Intrinsic::aarch64_sve_umax:
2074 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2075 case Intrinsic::aarch64_sve_umin:
2076 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2077 case Intrinsic::aarch64_sve_umulh:
2078 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2079 case Intrinsic::aarch64_sve_asr:
2080 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2081 case Intrinsic::aarch64_sve_lsl:
2082 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2083 case Intrinsic::aarch64_sve_lsr:
2084 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2085 case Intrinsic::aarch64_sve_and:
2086 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2087 case Intrinsic::aarch64_sve_bic:
2088 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2089 case Intrinsic::aarch64_sve_eor:
2090 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2091 case Intrinsic::aarch64_sve_orr:
2092 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2093 case Intrinsic::aarch64_sve_sqsub:
2094 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2095 case Intrinsic::aarch64_sve_uqsub:
2096 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2097 case Intrinsic::aarch64_sve_tbl:
2098 return instCombineSVETBL(IC, II);
2099 case Intrinsic::aarch64_sve_uunpkhi:
2100 case Intrinsic::aarch64_sve_uunpklo:
2101 case Intrinsic::aarch64_sve_sunpkhi:
2102 case Intrinsic::aarch64_sve_sunpklo:
2103 return instCombineSVEUnpack(IC, II);
2104 case Intrinsic::aarch64_sve_uzp1:
2105 return instCombineSVEUzp1(IC, II);
2106 case Intrinsic::aarch64_sve_zip1:
2107 case Intrinsic::aarch64_sve_zip2:
2108 return instCombineSVEZip(IC, II);
2109 case Intrinsic::aarch64_sve_ld1_gather_index:
2110 return instCombineLD1GatherIndex(IC, II);
2111 case Intrinsic::aarch64_sve_st1_scatter_index:
2112 return instCombineST1ScatterIndex(IC, II);
2113 case Intrinsic::aarch64_sve_ld1:
2114 return instCombineSVELD1(IC, II, DL);
2115 case Intrinsic::aarch64_sve_st1:
2116 return instCombineSVEST1(IC, II, DL);
2117 case Intrinsic::aarch64_sve_sdiv:
2118 return instCombineSVESDIV(IC, II);
2119 case Intrinsic::aarch64_sve_sel:
2120 return instCombineSVESel(IC, II);
2121 case Intrinsic::aarch64_sve_srshl:
2122 return instCombineSVESrshl(IC, II);
2123 case Intrinsic::aarch64_sve_dupq_lane:
2124 return instCombineSVEDupqLane(IC, II);
2125 }
2126
2127 return std::nullopt;
2128}
2129
2130std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2131 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2132 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2133 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2134 SimplifyAndSetOp) const {
2135 switch (II.getIntrinsicID()) {
2136 default:
2137 break;
2138 case Intrinsic::aarch64_neon_fcvtxn:
2139 case Intrinsic::aarch64_neon_rshrn:
2140 case Intrinsic::aarch64_neon_sqrshrn:
2141 case Intrinsic::aarch64_neon_sqrshrun:
2142 case Intrinsic::aarch64_neon_sqshrn:
2143 case Intrinsic::aarch64_neon_sqshrun:
2144 case Intrinsic::aarch64_neon_sqxtn:
2145 case Intrinsic::aarch64_neon_sqxtun:
2146 case Intrinsic::aarch64_neon_uqrshrn:
2147 case Intrinsic::aarch64_neon_uqshrn:
2148 case Intrinsic::aarch64_neon_uqxtn:
2149 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2150 break;
2151 }
2152
2153 return std::nullopt;
2154}
2155
2156TypeSize
2157AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2158 switch (K) {
2159 case TargetTransformInfo::RGK_Scalar:
2160 return TypeSize::getFixed(64);
2161 case TargetTransformInfo::RGK_FixedWidthVector:
2162 if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
2163 return TypeSize::getFixed(0);
2164
2165 if (ST->hasSVE())
2166 return TypeSize::getFixed(
2167 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2168
2169 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
2170 case TargetTransformInfo::RGK_ScalableVector:
2171 if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
2172 return TypeSize::getScalable(0);
2173
2174 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
2175 }
2176 llvm_unreachable("Unsupported register kind");
2177}
2178
2179bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2180 ArrayRef<const Value *> Args,
2181 Type *SrcOverrideTy) {
2182 // A helper that returns a vector type from the given type. The number of
2183 // elements in type Ty determines the vector width.
2184 auto toVectorTy = [&](Type *ArgTy) {
2185 return VectorType::get(ArgTy->getScalarType(),
2186 cast<VectorType>(DstTy)->getElementCount());
2187 };
2188
2189 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2190 // i32, i64]. SVE doesn't generally have the same set of instructions to
2191 // perform an extend with the add/sub/mul. There are SMULLB style
2192 // instructions, but they operate on top/bottom, requiring some sort of lane
2193 // interleaving to be used with zext/sext.
2194 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2195 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2196 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2197 return false;
2198
2199 // Determine if the operation has a widening variant. We consider both the
2200 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2201 // instructions.
2202 //
2203 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2204 // verify that their extending operands are eliminated during code
2205 // generation.
2206 Type *SrcTy = SrcOverrideTy;
2207 switch (Opcode) {
2208 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2209 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2210 // The second operand needs to be an extend
2211 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2212 if (!SrcTy)
2213 SrcTy =
2214 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2215 } else
2216 return false;
2217 break;
2218 case Instruction::Mul: { // SMULL(2), UMULL(2)
2219 // Both operands need to be extends of the same type.
2220 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2221 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2222 if (!SrcTy)
2223 SrcTy =
2224 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2225 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2226 // If one of the operands is a Zext and the other has enough zero bits to
2227 // be treated as unsigned, we can still generate a umull, meaning the zext
2228 // is free.
2229 KnownBits Known =
2230 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2231 if (Args[0]->getType()->getScalarSizeInBits() -
2232 Known.Zero.countLeadingOnes() >
2233 DstTy->getScalarSizeInBits() / 2)
2234 return false;
2235 if (!SrcTy)
2236 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2237 DstTy->getScalarSizeInBits() / 2));
2238 } else
2239 return false;
2240 break;
2241 }
2242 default:
2243 return false;
2244 }
2245
2246 // Legalize the destination type and ensure it can be used in a widening
2247 // operation.
2248 auto DstTyL = getTypeLegalizationCost(DstTy);
2249 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2250 return false;
2251
2252 // Legalize the source type and ensure it can be used in a widening
2253 // operation.
2254 assert(SrcTy && "Expected some SrcTy");
2255 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2256 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2257 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2258 return false;
2259
2260 // Get the total number of vector elements in the legalized types.
2261 InstructionCost NumDstEls =
2262 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2263 InstructionCost NumSrcEls =
2264 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2265
2266 // Return true if the legalized types have the same number of vector elements
2267 // and the destination element type size is twice that of the source type.
2268 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2269}
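// For example, with DstTy == <8 x i16> (an illustrative case):
//   %e = zext <8 x i8> %a to <8 x i16>
//   %r = add <8 x i16> %b, %e
// maps onto UADDW/UADDL, so the caller can treat the zext as free. The final
// checks require the legalized source and destination to have the same number
// of elements and a 2x element-size ratio, which is exactly the shape these
// widening instructions accept.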
2270
2271// s/urhadd instructions implement the following pattern, making the
2272// extends free:
2273// %x = add ((zext i8 -> i16), 1)
2274// %y = (zext i8 -> i16)
2275// trunc i16 (lshr (add %x, %y), 1) -> i8
2276//
2277static bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2278 Type *Src) {
2279 // The source should be a legal vector type.
2280 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2281 (Src->isScalableTy() && !ST->hasSVE2()))
2282 return false;
2283
2284 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2285 return false;
2286
2287 // Look for trunc/shl/add before trying to match the pattern.
2288 const Instruction *Add = ExtUser;
2289 auto *AddUser =
2290 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2291 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2292 Add = AddUser;
2293
2294 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2295 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2296 return false;
2297
2298 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2299 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2300 Src->getScalarSizeInBits() !=
2301 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2302 return false;
2303
2304 // Try to match the whole pattern. Ext could be either the first or second
2305 // m_ZExtOrSExt matched.
2306 Instruction *Ex1, *Ex2;
2307 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2308 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2309 return false;
2310
2311 // Ensure both extends are of the same type
2312 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2313 Ex1->getOpcode() == Ex2->getOpcode())
2314 return true;
2315
2316 return false;
2317}
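// Written out, the rounding-halving-add shape being matched is (illustrative
// types):
//   %x = add (zext <8 x i8> %a to <8 x i16>), splat(1)
//   %y = zext <8 x i8> %b to <8 x i16>
//   %s = add %x, %y
//   %r = trunc <8 x i16> (lshr %s, splat(1)) to <8 x i8>
// which selects to URHADD (or SRHADD for sign extends), so both extends end
// up free.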
2318
2319InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2320 Type *Src,
2321 TTI::CastContextHint CCH,
2322 TTI::TargetCostKind CostKind,
2323 const Instruction *I) {
2324 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2325 assert(ISD && "Invalid opcode");
2326 // If the cast is observable, and it is used by a widening instruction (e.g.,
2327 // uaddl, saddw, etc.), it may be free.
2328 if (I && I->hasOneUser()) {
2329 auto *SingleUser = cast<Instruction>(*I->user_begin());
2330 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2331 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2332 // For adds only count the second operand as free if both operands are
2333 // extends but not the same operation. (i.e both operands are not free in
2334 // add(sext, zext)).
2335 if (SingleUser->getOpcode() == Instruction::Add) {
2336 if (I == SingleUser->getOperand(1) ||
2337 (isa<CastInst>(SingleUser->getOperand(1)) &&
2338 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2339 return 0;
2340 } else // Others are free so long as isWideningInstruction returned true.
2341 return 0;
2342 }
2343
2344 // The cast will be free for the s/urhadd instructions
2345 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2346 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2347 return 0;
2348 }
2349
2350 // TODO: Allow non-throughput costs that aren't binary.
2351 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2352 if (CostKind != TTI::TCK_RecipThroughput)
2353 return Cost == 0 ? 0 : 1;
2354 return Cost;
2355 };
2356
2357 EVT SrcTy = TLI->getValueType(DL, Src);
2358 EVT DstTy = TLI->getValueType(DL, Dst);
2359
2360 if (!SrcTy.isSimple() || !DstTy.isSimple())
2361 return AdjustCost(
2362 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2363
2364 static const TypeConversionCostTblEntry
2365 ConversionTbl[] = {
2366 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2367 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2368 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2369 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2370 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2371 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2372 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2373 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2374 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2375 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2376 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2377 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2378 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2379 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2380 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2381 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2382 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2383 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2384 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2385 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2386
2387 // Truncations on nxvmiN
2388 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2389 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2390 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2391 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2392 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2393 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2394 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2395 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2396 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2397 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2398 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2399 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2400 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2401 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2402 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2403 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2404
2405 // The number of shll instructions for the extension.
2406 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2407 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2408 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2409 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2410 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2411 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2412 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2413 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2414 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2415 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2416 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2417 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2418 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2419 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2420 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2421 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2422
2423 // LowerVectorINT_TO_FP:
2424 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2425 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2426 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2427 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2428 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2429 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2430
2431 // Complex: to v2f32
2432 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2433 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2434 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2435 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2436 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2437 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2438
2439 // Complex: to v4f32
2440 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2441 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2442 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2443 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2444
2445 // Complex: to v8f32
2446 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2447 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2448 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2449 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2450
2451 // Complex: to v16f32
2452 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2453 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2454
2455 // Complex: to v2f64
2456 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2457 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2458 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2459 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2460 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2461 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2462
2463 // Complex: to v4f64
2464 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2465 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2466
2467 // LowerVectorFP_TO_INT
2468 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2469 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2470 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2471 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2472 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2473 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2474
2475 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2476 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2477 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2478 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2479 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2480 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2481 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2482
2483 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2484 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2485 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2486 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2487 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2488
2489 // Complex, from nxv2f32.
2490 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2491 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2492 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2493 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2494 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2495 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2496 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2497 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2498
2499 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2500 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2501 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2502 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2503 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2504 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2505 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2506
2507 // Complex, from nxv2f64.
2508 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2509 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2510 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2511 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2512 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2513 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2514 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2515 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2516
2517 // Complex, from nxv4f32.
2518 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2519 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2520 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2521 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2522 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2523 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2524 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2525 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2526
2527 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2528 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2529 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2530 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2531 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2532
2533 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2534 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2535 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2536 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2537 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2538 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2539 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2540
2541 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2542 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2543 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2544 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2545 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2546
2547 // Complex, from nxv8f16.
2548 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2549 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2550 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2551 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2552 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2553 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2554 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2555 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2556
2557 // Complex, from nxv4f16.
2558 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2559 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2560 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2561 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2562 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2563 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2564 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2565 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2566
2567 // Complex, from nxv2f16.
2568 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2569 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2570 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2571 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2572 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2573 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2574 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2575 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2576
2577 // Truncate from nxvmf32 to nxvmf16.
2578 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2579 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2580 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2581
2582 // Truncate from nxvmf64 to nxvmf16.
2583 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2584 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2585 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2586
2587 // Truncate from nxvmf64 to nxvmf32.
2588 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2589 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2590 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2591
2592 // Extend from nxvmf16 to nxvmf32.
2593 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2594 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2595 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2596
2597 // Extend from nxvmf16 to nxvmf64.
2598 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2599 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2600 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2601
2602 // Extend from nxvmf32 to nxvmf64.
2603 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2604 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2605 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2606
2607 // Bitcasts from float to integer
2608 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2609 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2610 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2611
2612 // Bitcasts from integer to float
2613 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2614 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2615 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2616
2617 // Add cost for extending to illegal -too wide- scalable vectors.
2618 // zero/sign extend are implemented by multiple unpack operations,
2619 // where each operation has a cost of 1.
2620 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2621 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2622 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2623 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2624 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2625 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2626
2627 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2628 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2629 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2630 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2631 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2632 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2633 };
2634
2635 // Estimate the cost of a fixed-length vector operation that is executed
2636 // on SVE registers by scaling with the number of SVE registers required
2637 // to hold the fixed-length type.
2638 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2639 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2640 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2641 ST->useSVEForFixedLengthVectors(WiderTy)) {
2642 std::pair<InstructionCost, MVT> LT =
2643 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2644 unsigned NumElements = AArch64::SVEBitsPerBlock /
2645 LT.second.getVectorElementType().getSizeInBits();
2646 return AdjustCost(
2647 LT.first *
2648 getCastInstrCost(
2649 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2650 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2651 CostKind, I));
2652 }
2653
2654 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2655 DstTy.getSimpleVT(),
2656 SrcTy.getSimpleVT()))
2657 return AdjustCost(Entry->Cost);
2658
2659 static const TypeConversionCostTblEntry FP16Tbl[] = {
2660 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2661 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2662 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2663 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2664 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2665 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2666 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2667 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2668 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2669 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2670 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2671 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2672 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2673 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2674 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2675 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2676 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2677 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2678 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2679 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2680 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2681 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2682 };
2683
2684 if (ST->hasFullFP16())
2685 if (const auto *Entry = ConvertCostTableLookup(
2686 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2687 return AdjustCost(Entry->Cost);
2688
2689 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2690 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2691 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2692 TargetLowering::TypePromoteInteger &&
2693 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2694 TargetLowering::TypeSplitVector) {
2695 // The standard behaviour in the backend for these cases is to split the
2696 // extend up into two parts:
2697 // 1. Perform an extending load or masked load up to the legal type.
2698 // 2. Extend the loaded data to the final type.
2699 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2700 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2701 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2702 Opcode, LegalTy, Src, CCH, CostKind, I);
2703 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2704 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2705 return Part1 + Part2;
2706 }
2707
2708 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2709 // but we also want to include the TTI::CastContextHint::Masked case too.
2710 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2711 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2712 TLI->isTypeLegal(DstTy))
2713 CCH = TTI::CastContextHint::Normal;
2714
2715 return AdjustCost(
2716 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2717}
2718
2719InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2720 Type *Dst,
2721 VectorType *VecTy,
2722 unsigned Index) {
2723
2724 // Make sure we were given a valid extend opcode.
2725 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2726 "Invalid opcode");
2727
2728 // We are extending an element we extract from a vector, so the source type
2729 // of the extend is the element type of the vector.
2730 auto *Src = VecTy->getElementType();
2731
2732 // Sign- and zero-extends are for integer types only.
2733 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2734
2735 // Get the cost for the extract. We compute the cost (if any) for the extend
2736 // below.
2737 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2738 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2739 CostKind, Index, nullptr, nullptr);
2740
2741 // Legalize the types.
2742 auto VecLT = getTypeLegalizationCost(VecTy);
2743 auto DstVT = TLI->getValueType(DL, Dst);
2744 auto SrcVT = TLI->getValueType(DL, Src);
2745
2746 // If the resulting type is still a vector and the destination type is legal,
2747 // we may get the extension for free. If not, get the default cost for the
2748 // extend.
2749 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2750 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2751 CostKind);
2752
2753 // The destination type should be larger than the element type. If not, get
2754 // the default cost for the extend.
2755 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2756 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2757 CostKind);
2758
2759 switch (Opcode) {
2760 default:
2761 llvm_unreachable("Opcode should be either SExt or ZExt");
2762
2763 // For sign-extends, we only need a smov, which performs the extension
2764 // automatically.
2765 case Instruction::SExt:
2766 return Cost;
2767
2768 // For zero-extends, the extend is performed automatically by a umov unless
2769 // the destination type is i64 and the element type is i8 or i16.
2770 case Instruction::ZExt:
2771 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2772 return Cost;
2773 }
2774
2775 // If we are unable to perform the extend for free, get the default cost.
2776 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2777 CostKind);
2778}
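// In other words: a sign-extending extract is modelled as a single smov, so
// only the extract itself is charged. A zero-extending extract is likewise
// free, except for the one case singled out above (an i8/i16 element extended
// all the way to i64), where the generic extend cost is added on top of the
// extract cost.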
2779
2780InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2781 TTI::TargetCostKind CostKind,
2782 const Instruction *I) {
2783 if (CostKind != TTI::TCK_RecipThroughput)
2784 return Opcode == Instruction::PHI ? 0 : 1;
2785 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2786 // Branches are assumed to be predicted.
2787 return 0;
2788}
2789
2790InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2791 Type *Val,
2792 unsigned Index,
2793 bool HasRealUse) {
2794 assert(Val->isVectorTy() && "This must be a vector type");
2795
2796 if (Index != -1U) {
2797 // Legalize the type.
2798 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2799
2800 // This type is legalized to a scalar type.
2801 if (!LT.second.isVector())
2802 return 0;
2803
2804 // The type may be split. For fixed-width vectors we can normalize the
2805 // index to the new type.
2806 if (LT.second.isFixedLengthVector()) {
2807 unsigned Width = LT.second.getVectorNumElements();
2808 Index = Index % Width;
2809 }
2810
2811 // The element at index zero is already inside the vector.
2812 // - For a physical (HasRealUse==true) insert-element or extract-element
2813 // instruction that extracts integers, an explicit FPR -> GPR move is
2814 // needed. So it has non-zero cost.
2815 // - For the rest of cases (virtual instruction or element type is float),
2816 // consider the instruction free.
2817 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2818 return 0;
2819
2820 // This recognises an LD1 (load single-element structure to one lane of
2821 // one register) instruction. I.e., if this is an `insertelement`
2822 // instruction whose second operand is a load, we will generate an LD1,
2823 // which is an expensive instruction.
2824 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2825 return ST->getVectorInsertExtractBaseCost() + 1;
2826
2827 // i1 inserts and extract will include an extra cset or cmp of the vector
2828 // value. Increase the cost by 1 to account.
2829 if (Val->getScalarSizeInBits() == 1)
2830 return ST->getVectorInsertExtractBaseCost() + 1;
2831
2832 // FIXME:
2833 // If the extract-element and insert-element instructions could be
2834 // simplified away (e.g., could be combined into users by looking at use-def
2835 // context), they have no cost. This is not done in the first place for
2836 // compile-time considerations.
2837 }
2838
2839 // All other insert/extracts cost this much.
2840 return ST->getVectorInsertExtractBaseCost();
2841}
2842
2843InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2844 TTI::TargetCostKind CostKind,
2845 unsigned Index, Value *Op0,
2846 Value *Op1) {
2847 bool HasRealUse =
2848 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2849 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2850}
2851
2852InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2853 Type *Val,
2854 TTI::TargetCostKind CostKind,
2855 unsigned Index) {
2856 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2857}
2858
2859InstructionCost AArch64TTIImpl::getScalarizationOverhead(
2860 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2861 TTI::TargetCostKind CostKind) {
2862 if (isa<ScalableVectorType>(Ty))
2863 return InstructionCost::getInvalid();
2864 if (Ty->getElementType()->isFloatingPointTy())
2865 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
2866 CostKind);
2867 return DemandedElts.popcount() * (Insert + Extract) *
2868 ST->getVectorInsertExtractBaseCost();
2869}
2870
2871InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2872 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2873 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2874 ArrayRef<const Value *> Args,
2875 const Instruction *CxtI) {
2876
2877 // TODO: Handle more cost kinds.
2878 if (CostKind != TTI::TCK_RecipThroughput)
2879 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2880 Op2Info, Args, CxtI);
2881
2882 // Legalize the type.
2883 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2884 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2885
2886 switch (ISD) {
2887 default:
2888 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2889 Op2Info);
2890 case ISD::SDIV:
2891 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2892 // On AArch64, scalar signed division by a power-of-two constant is
2893 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2894 // The OperandValue properties may not be the same as those of the
2895 // previous operation; conservatively assume OP_None.
2896 InstructionCost Cost = getArithmeticInstrCost(
2897 Instruction::Add, Ty, CostKind,
2898 Op1Info.getNoProps(), Op2Info.getNoProps());
2899 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2900 Op1Info.getNoProps(), Op2Info.getNoProps());
2901 Cost += getArithmeticInstrCost(
2902 Instruction::Select, Ty, CostKind,
2903 Op1Info.getNoProps(), Op2Info.getNoProps());
2904 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2905 Op1Info.getNoProps(), Op2Info.getNoProps());
2906 return Cost;
2907 }
2908 [[fallthrough]];
2909 case ISD::UDIV: {
2910 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2911 auto VT = TLI->getValueType(DL, Ty);
2912 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2913 // Vector signed division by a constant is expanded to the
2914 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2915 // to MULHS + SUB + SRL + ADD + SRL.
2916 InstructionCost MulCost = getArithmeticInstrCost(
2917 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2918 InstructionCost AddCost = getArithmeticInstrCost(
2919 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2920 InstructionCost ShrCost = getArithmeticInstrCost(
2921 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2922 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2923 }
2924 }
2925
2926 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2927 Opcode, Ty, CostKind, Op1Info, Op2Info);
2928 if (Ty->isVectorTy()) {
2929 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2930 // If the SDIV/UDIV operation is lowered using SVE, the cost is
2931 // lower.
2932 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2933 ->getPrimitiveSizeInBits()
2934 .getFixedValue() < 128) {
2935 EVT VT = TLI->getValueType(DL, Ty);
2936 static const CostTblEntry DivTbl[]{
2937 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2938 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2939 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2940 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2941 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2942 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2943
2944 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2945 if (nullptr != Entry)
2946 return Entry->Cost;
2947 }
2948 // For 8/16-bit elements, the cost is higher because the type
2949 // requires promotion and possibly splitting:
2950 if (LT.second.getScalarType() == MVT::i8)
2951 Cost *= 8;
2952 else if (LT.second.getScalarType() == MVT::i16)
2953 Cost *= 4;
2954 return Cost;
2955 } else {
2956 // If one of the operands is a uniform constant then the cost for each
2957 // element is Cost for insertion, extraction and division.
2958 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
2959 // operation with scalar type
2960 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2961 (Op2Info.isConstant() && Op2Info.isUniform())) {
2962 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2963 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2964 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2965 return (4 + DivCost) * VTy->getNumElements();
2966 }
2967 }
2968 // On AArch64, without SVE, vector divisions are expanded
2969 // into scalar divisions of each pair of elements.
2970 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2971 CostKind, Op1Info, Op2Info);
2972 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2973 Op1Info, Op2Info);
2974 }
2975
2976 // TODO: if one of the arguments is scalar, then it's not necessary to
2977 // double the cost of handling the vector elements.
2978 Cost += Cost;
2979 }
2980 return Cost;
2981 }
2982 case ISD::MUL:
2983 // When SVE is available, then we can lower the v2i64 operation using
2984 // the SVE mul instruction, which has a lower cost.
2985 if (LT.second == MVT::v2i64 && ST->hasSVE())
2986 return LT.first;
2987
2988 // When SVE is not available, there is no MUL.2d instruction,
2989 // which means mul <2 x i64> is expensive as elements are extracted
2990 // from the vectors and the muls scalarized.
2991 // As getScalarizationOverhead is a bit too pessimistic, we
2992 // estimate the cost for a i64 vector directly here, which is:
2993 // - four 2-cost i64 extracts,
2994 // - two 2-cost i64 inserts, and
2995 // - two 1-cost muls.
2996 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2997 // LT.first = 2 the cost is 28. If both operands are extensions it will not
2998 // need to scalarize so the cost can be cheaper (smull or umull).
3000 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3001 return LT.first;
3002 return LT.first * 14;
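// For example, mul <2 x i64> (sext <2 x i32> %a to <2 x i64>),
//                            (sext <2 x i32> %b to <2 x i64>)
// is typically selected as a single smull, so it is treated as a widening
// instruction above and keeps the cheap LT.first cost instead of the
// scalarized cost of 14.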
3003 case ISD::ADD:
3004 case ISD::XOR:
3005 case ISD::OR:
3006 case ISD::AND:
3007 case ISD::SRL:
3008 case ISD::SRA:
3009 case ISD::SHL:
3010 // These nodes are marked as 'custom' for combining purposes only.
3011 // We know that they are legal. See LowerAdd in ISelLowering.
3012 return LT.first;
3013
3014 case ISD::FNEG:
3015 case ISD::FADD:
3016 case ISD::FSUB:
3017 // Increase the cost for half and bfloat types if not architecturally
3018 // supported.
3019 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3020 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3021 return 2 * LT.first;
3022 if (!Ty->getScalarType()->isFP128Ty())
3023 return LT.first;
3024 [[fallthrough]];
3025 case ISD::FMUL:
3026 case ISD::FDIV:
3027 // These nodes are marked as 'custom' just to lower them to SVE.
3028 // We know said lowering will incur no additional cost.
3029 if (!Ty->getScalarType()->isFP128Ty())
3030 return 2 * LT.first;
3031
3032 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3033 Op2Info);
3034 case ISD::FREM:
3035 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3036 // those functions are not declared in the module.
3037 if (!Ty->isVectorTy())
3038 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3039 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3040 Op2Info);
3041 }
3042}
3043
3044 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3045 ScalarEvolution *SE,
3046 const SCEV *Ptr) {
3047 // Address computations in vectorized code with non-consecutive addresses will
3048 // likely result in more instructions compared to scalar code where the
3049 // computation can more often be merged into the index mode. The resulting
3050 // extra micro-ops can significantly decrease throughput.
3051 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3052 int MaxMergeDistance = 64;
3053
3054 if (Ty->isVectorTy() && SE &&
3055 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3056 return NumVectorInstToHideOverhead;
3057
3058 // In many cases the address computation is not merged into the instruction
3059 // addressing mode.
3060 return 1;
3061}
3062
3063 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3064 Type *CondTy,
3065 CmpInst::Predicate VecPred,
3066 TTI::TargetCostKind CostKind,
3067 const Instruction *I) {
3068 // TODO: Handle other cost kinds.
3069 if (CostKind != TTI::TCK_RecipThroughput)
3070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3071 I);
3072
3073 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3074 // We don't lower some vector selects well when they are wider than the
3075 // register width.
3076 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3077 // We would need this many instructions to hide the scalarization happening.
3078 const int AmortizationCost = 20;
3079
3080 // If VecPred is not set, check if we can get a predicate from the context
3081 // instruction, if its type matches the requested ValTy.
3082 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3083 CmpInst::Predicate CurrentPred;
3084 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3085 m_Value())))
3086 VecPred = CurrentPred;
3087 }
3088 // Check if we have a compare/select chain that can be lowered using
3089 // a (F)CMxx & BFI pair.
3090 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3091 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3092 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3093 VecPred == CmpInst::FCMP_UNE) {
3094 static const auto ValidMinMaxTys = {
3095 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3096 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3097 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3098
3099 auto LT = getTypeLegalizationCost(ValTy);
3100 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3101 (ST->hasFullFP16() &&
3102 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3103 return LT.first;
3104 }
3105
3106 static const TypeConversionCostTblEntry
3107 VectorSelectTbl[] = {
3108 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3109 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3110 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3111 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3112 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3113 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3114 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3115 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3116 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3117 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3118 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3119 };
3120
3121 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3122 EVT SelValTy = TLI->getValueType(DL, ValTy);
3123 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3124 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3125 SelCondTy.getSimpleVT(),
3126 SelValTy.getSimpleVT()))
3127 return Entry->Cost;
3128 }
3129 }
3130
3131 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3132 auto LT = getTypeLegalizationCost(ValTy);
3133 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3134 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3135 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3136 }
3137
3138 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3139 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3140 // be profitable.
3141 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3142 ICmpInst::isEquality(VecPred) &&
3143 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3144 match(I->getOperand(1), m_Zero()) &&
3145 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3146 return 0;
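// For example, with
//   %a = and i64 %x, %y
//   %c = icmp eq i64 %a, 0
// the compare is usually folded into a flag-setting ands/tst, which is why
// the icmp itself is costed as free here.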
3147
3148 // The base case handles scalable vectors fine for now, since it treats the
3149 // cost as 1 * legalization cost.
3150 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3151}
3152
3153 TTI::MemCmpExpansionOptions
3154 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3155 TTI::MemCmpExpansionOptions Options;
3156 if (ST->requiresStrictAlign()) {
3157 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3158 // a bunch of instructions when strict align is enabled.
3159 return Options;
3160 }
3161 Options.AllowOverlappingLoads = true;
3162 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3163 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3164 // TODO: Though vector loads usually perform well on AArch64, in some targets
3165 // they may wake up the FP unit, which raises the power consumption. Perhaps
3166 // they could be used with no holds barred (-O3).
3167 Options.LoadSizes = {8, 4, 2, 1};
3168 Options.AllowedTailExpansions = {3, 5, 6};
3169 return Options;
3170}
3171
3172 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3173 return ST->hasSVE();
3174}
3175
3176 InstructionCost
3177 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3178 Align Alignment, unsigned AddressSpace,
3179 TTI::TargetCostKind CostKind) {
3180 if (useNeonVector(Src))
3181 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3182 CostKind);
3183 auto LT = getTypeLegalizationCost(Src);
3184 if (!LT.first.isValid())
3185 return InstructionCost::getInvalid();
3186
3187 // The code-generator is currently not able to handle scalable vectors
3188 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3189 // it. This change will be removed when code-generation for these types is
3190 // sufficiently reliable.
3191 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
3192 return InstructionCost::getInvalid();
3193
3194 return LT.first;
3195}
3196
3197static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3198 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3199}
3200
3201 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3202 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3203 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3204 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3205 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3206 Alignment, CostKind, I);
3207 auto *VT = cast<VectorType>(DataTy);
3208 auto LT = getTypeLegalizationCost(DataTy);
3209 if (!LT.first.isValid())
3210 return InstructionCost::getInvalid();
3211
3212 if (!LT.second.isVector() ||
3213 !isElementTypeLegalForScalableVector(VT->getElementType()))
3214 return InstructionCost::getInvalid();
3215
3216 // The code-generator is currently not able to handle scalable vectors
3217 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3218 // it. This change will be removed when code-generation for these types is
3219 // sufficiently reliable.
3220 if (cast<VectorType>(DataTy)->getElementCount() ==
3221 ElementCount::getScalable(1))
3222 return InstructionCost::getInvalid();
3223
3224 ElementCount LegalVF = LT.second.getVectorElementCount();
3225 InstructionCost MemOpCost =
3226 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3227 {TTI::OK_AnyValue, TTI::OP_None}, I);
3228 // Add on an overhead cost for using gathers/scatters.
3229 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3230 // point we may want a per-CPU overhead.
3231 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3232 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3233}
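// Illustration of the formula above: with the default sve-gather-overhead of
// 10, a gather of <vscale x 4 x i32> is costed as LT.first * (per-element
// load cost * 10), scaled by the maximum number of elements per vector, so
// gathers/scatters end up far more expensive than contiguous accesses.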
3234
3235 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3236 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3237}
3238
3239 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3240 MaybeAlign Alignment,
3241 unsigned AddressSpace,
3242 TTI::TargetCostKind CostKind,
3243 TTI::OperandValueInfo OpInfo,
3244 const Instruction *I) {
3245 EVT VT = TLI->getValueType(DL, Ty, true);
3246 // Type legalization can't handle structs
3247 if (VT == MVT::Other)
3248 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3249 CostKind);
3250
3251 auto LT = getTypeLegalizationCost(Ty);
3252 if (!LT.first.isValid())
3253 return InstructionCost::getInvalid();
3254
3255 // The code-generator is currently not able to handle scalable vectors
3256 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3257 // it. This change will be removed when code-generation for these types is
3258 // sufficiently reliable.
3259 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3260 if (VTy->getElementCount() == ElementCount::getScalable(1))
3261 return InstructionCost::getInvalid();
3262
3263 // TODO: consider latency as well for TCK_SizeAndLatency.
3264 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3265 return LT.first;
3266
3267 if (CostKind != TTI::TCK_RecipThroughput)
3268 return 1;
3269
3270 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3271 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3272 // Unaligned stores are extremely inefficient. We don't split all
3273 // unaligned 128-bit stores because of the negative impact that has been
3274 // shown in practice on inlined block copy code.
3275 // We make such stores expensive so that we will only vectorize if there
3276 // are 6 other instructions getting vectorized.
3277 const int AmortizationCost = 6;
3278
3279 return LT.first * 2 * AmortizationCost;
3280 }
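// For example, an unaligned store of <4 x i32> on such a CPU is costed as
// LT.first * 2 * 6 = 12, versus roughly 1 for the aligned case, so it is only
// chosen when enough surrounding work gets vectorized with it.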
3281
3282 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3283 if (Ty->isPtrOrPtrVectorTy())
3284 return LT.first;
3285
3286 if (useNeonVector(Ty)) {
3287 // Check truncating stores and extending loads.
3288 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3289 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3290 if (VT == MVT::v4i8)
3291 return 2;
3292 // Otherwise we need to scalarize.
3293 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3294 }
3295 EVT EltVT = VT.getVectorElementType();
3296 unsigned EltSize = EltVT.getScalarSizeInBits();
3297 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3298 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3299 *Alignment != Align(1))
3300 return LT.first;
3301 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3302 // widening to v4i8, which produces suboptimal results.
3303 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3304 return LT.first;
3305
3306 // Check non-power-of-2 loads/stores for legal vector element types with
3307 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3308 // operations on smaller power-of-2 ops, including ld1/st1.
3309 LLVMContext &C = Ty->getContext();
3310 InstructionCost Cost = 0;
3311 SmallVector<EVT> TypeWorklist;
3312 TypeWorklist.push_back(VT);
3313 while (!TypeWorklist.empty()) {
3314 EVT CurrVT = TypeWorklist.pop_back_val();
3315 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3316 if (isPowerOf2_32(CurrNumElements)) {
3317 Cost += 1;
3318 continue;
3319 }
3320
3321 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3322 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3323 TypeWorklist.push_back(
3324 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3325 }
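// Worked example of the decomposition above: a v7i16 access splits into
// v4i16 + v3i16, and v3i16 splits again into v2i16 + v1i16, so three
// power-of-two pieces are retired and the returned cost is 3.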
3326 return Cost;
3327 }
3328
3329 return LT.first;
3330}
3331
3332 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3333 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3334 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3335 bool UseMaskForCond, bool UseMaskForGaps) {
3336 assert(Factor >= 2 && "Invalid interleave factor");
3337 auto *VecVTy = cast<VectorType>(VecTy);
3338
3339 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3340 return InstructionCost::getInvalid();
3341
3342 // Vectorization for masked interleaved accesses is only enabled for scalable
3343 // VF.
3344 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3345 return InstructionCost::getInvalid();
3346
3347 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3348 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3349 auto *SubVecTy =
3350 VectorType::get(VecVTy->getElementType(),
3351 VecVTy->getElementCount().divideCoefficientBy(Factor));
3352
3353 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3354 // Accesses having vector types that are a multiple of 128 bits can be
3355 // matched to more than one ldN/stN instruction.
3356 bool UseScalable;
3357 if (MinElts % Factor == 0 &&
3358 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3359 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3360 }
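// For example, an ld3 of <12 x i32> (Factor = 3) uses SubVecTy = <4 x i32>,
// which is a single legal 128-bit ldN access, so the returned cost is
// 3 * 1 = 3.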
3361
3362 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3363 Alignment, AddressSpace, CostKind,
3364 UseMaskForCond, UseMaskForGaps);
3365}
3366
3367 InstructionCost
3368 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3369 InstructionCost Cost = 0;
3370 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3371 for (auto *I : Tys) {
3372 if (!I->isVectorTy())
3373 continue;
3374 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3375 128)
3376 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3377 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3378 }
3379 return Cost;
3380}
3381
3382 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3383 return ST->getMaxInterleaveFactor();
3384}
3385
3386// For Falkor, we want to avoid having too many strided loads in a loop since
3387// that can exhaust the HW prefetcher resources. We adjust the unroller
3388// MaxCount preference below to attempt to ensure unrolling doesn't create too
3389// many strided loads.
3390static void
3391 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3392 TargetTransformInfo::UnrollingPreferences &UP) {
3393 enum { MaxStridedLoads = 7 };
3394 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3395 int StridedLoads = 0;
3396 // FIXME? We could make this more precise by looking at the CFG and
3397 // e.g. not counting loads in each side of an if-then-else diamond.
3398 for (const auto BB : L->blocks()) {
3399 for (auto &I : *BB) {
3400 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3401 if (!LMemI)
3402 continue;
3403
3404 Value *PtrValue = LMemI->getPointerOperand();
3405 if (L->isLoopInvariant(PtrValue))
3406 continue;
3407
3408 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3409 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3410 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3411 continue;
3412
3413 // FIXME? We could take pairing of unrolled load copies into account
3414 // by looking at the AddRec, but we would probably have to limit this
3415 // to loops with no stores or other memory optimization barriers.
3416 ++StridedLoads;
3417 // We've seen enough strided loads that seeing more won't make a
3418 // difference.
3419 if (StridedLoads > MaxStridedLoads / 2)
3420 return StridedLoads;
3421 }
3422 }
3423 return StridedLoads;
3424 };
3425
3426 int StridedLoads = countStridedLoads(L, SE);
3427 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3428 << " strided loads\n");
3429 // Pick the largest power of 2 unroll count that won't result in too many
3430 // strided loads.
3431 if (StridedLoads) {
3432 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3433 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3434 << UP.MaxCount << '\n');
3435 }
3436}
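// For example, a loop body containing 2 strided loads gets
// UP.MaxCount = 1 << Log2_32(7 / 2) = 2, so unrolling beyond 2 is avoided to
// keep the number of strided streams within the prefetcher's budget.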
3437
3438 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3439 TTI::UnrollingPreferences &UP,
3440 OptimizationRemarkEmitter *ORE) {
3441 // Enable partial unrolling and runtime unrolling.
3442 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3443
3444 UP.UpperBound = true;
3445
3446 // Inner loops are more likely to be hot, and the runtime check can be
3447 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3448 // threshold to unroll more loops.
3449 if (L->getLoopDepth() > 1)
3450 UP.PartialThreshold *= 2;
3451
3452 // Disable partial & runtime unrolling on -Os.
3453 UP.PartialOptSizeThreshold = 0;
3454
3455 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3456 EnableFalkorHWPFUnrollFix)
3457 getFalkorUnrollingPreferences(L, SE, UP);
3458
3459 // Scan the loop: don't unroll loops with calls as this could prevent
3460 // inlining. Don't unroll vector loops either, as they don't benefit much from
3461 // unrolling.
3462 for (auto *BB : L->getBlocks()) {
3463 for (auto &I : *BB) {
3464 // Don't unroll vectorised loops.
3465 if (I.getType()->isVectorTy())
3466 return;
3467
3468 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3469 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3470 if (!isLoweredToCall(F))
3471 continue;
3472 }
3473 return;
3474 }
3475 }
3476 }
3477
3478 // Enable runtime unrolling for in-order models
3479 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3480 // checking for that case, we can ensure that the default behaviour is
3481 // unchanged.
3482 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3483 !ST->getSchedModel().isOutOfOrder()) {
3484 UP.Runtime = true;
3485 UP.Partial = true;
3486 UP.UnrollRemainder = true;
3487 UP.DefaultUnrollRuntimeCount = 4;
3488
3489 UP.UnrollAndJam = true;
3490 UP.UnrollAndJamInnerLoopThreshold = 60;
3491 }
3492}
3493
3494 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3495 TTI::PeelingPreferences &PP) {
3496 BaseT::getPeelingPreferences(L, SE, PP);
3497 }
3498
3499 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
3500 Type *ExpectedType) {
3501 switch (Inst->getIntrinsicID()) {
3502 default:
3503 return nullptr;
3504 case Intrinsic::aarch64_neon_st2:
3505 case Intrinsic::aarch64_neon_st3:
3506 case Intrinsic::aarch64_neon_st4: {
3507 // Create a struct type
3508 StructType *ST = dyn_cast<StructType>(ExpectedType);
3509 if (!ST)
3510 return nullptr;
3511 unsigned NumElts = Inst->arg_size() - 1;
3512 if (ST->getNumElements() != NumElts)
3513 return nullptr;
3514 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3515 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3516 return nullptr;
3517 }
3518 Value *Res = PoisonValue::get(ExpectedType);
3519 IRBuilder<> Builder(Inst);
3520 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3521 Value *L = Inst->getArgOperand(i);
3522 Res = Builder.CreateInsertValue(Res, L, i);
3523 }
3524 return Res;
3525 }
3526 case Intrinsic::aarch64_neon_ld2:
3527 case Intrinsic::aarch64_neon_ld3:
3528 case Intrinsic::aarch64_neon_ld4:
3529 if (Inst->getType() == ExpectedType)
3530 return Inst;
3531 return nullptr;
3532 }
3533}
3534
3535 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3536 MemIntrinsicInfo &Info) {
3537 switch (Inst->getIntrinsicID()) {
3538 default:
3539 break;
3540 case Intrinsic::aarch64_neon_ld2:
3541 case Intrinsic::aarch64_neon_ld3:
3542 case Intrinsic::aarch64_neon_ld4:
3543 Info.ReadMem = true;
3544 Info.WriteMem = false;
3545 Info.PtrVal = Inst->getArgOperand(0);
3546 break;
3547 case Intrinsic::aarch64_neon_st2:
3548 case Intrinsic::aarch64_neon_st3:
3549 case Intrinsic::aarch64_neon_st4:
3550 Info.ReadMem = false;
3551 Info.WriteMem = true;
3552 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3553 break;
3554 }
3555
3556 switch (Inst->getIntrinsicID()) {
3557 default:
3558 return false;
3559 case Intrinsic::aarch64_neon_ld2:
3560 case Intrinsic::aarch64_neon_st2:
3561 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3562 break;
3563 case Intrinsic::aarch64_neon_ld3:
3564 case Intrinsic::aarch64_neon_st3:
3565 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3566 break;
3567 case Intrinsic::aarch64_neon_ld4:
3568 case Intrinsic::aarch64_neon_st4:
3569 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3570 break;
3571 }
3572 return true;
3573}
3574
3575/// See if \p I should be considered for address type promotion. We check if \p
3576 /// I is a sext with the right type and used in memory accesses. If it is used
3577 /// in a "complex" getelementptr, we allow it to be promoted without finding other
3578/// sext instructions that sign extended the same initial value. A getelementptr
3579/// is considered as "complex" if it has more than 2 operands.
3580 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3581 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3582 bool Considerable = false;
3583 AllowPromotionWithoutCommonHeader = false;
3584 if (!isa<SExtInst>(&I))
3585 return false;
3586 Type *ConsideredSExtType =
3587 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3588 if (I.getType() != ConsideredSExtType)
3589 return false;
3590 // See if the sext is the one with the right type and used in at least one
3591 // GetElementPtrInst.
3592 for (const User *U : I.users()) {
3593 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3594 Considerable = true;
3595 // A getelementptr is considered as "complex" if it has more than 2
3596 // operands. We will promote a SExt used in such complex GEP as we
3597 // expect some computation to be merged if they are done on 64 bits.
3598 if (GEPInst->getNumOperands() > 2) {
3599 AllowPromotionWithoutCommonHeader = true;
3600 break;
3601 }
3602 }
3603 }
3604 return Considerable;
3605}
3606
3607 bool AArch64TTIImpl::isLegalToVectorizeReduction(
3608 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3609 if (!VF.isScalable())
3610 return true;
3611
3612 Type *Ty = RdxDesc.getRecurrenceType();
3613 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3614 return false;
3615
3616 switch (RdxDesc.getRecurrenceKind()) {
3617 case RecurKind::Add:
3618 case RecurKind::FAdd:
3619 case RecurKind::And:
3620 case RecurKind::Or:
3621 case RecurKind::Xor:
3622 case RecurKind::SMin:
3623 case RecurKind::SMax:
3624 case RecurKind::UMin:
3625 case RecurKind::UMax:
3626 case RecurKind::FMin:
3627 case RecurKind::FMax:
3628 case RecurKind::FMulAdd:
3629 case RecurKind::IAnyOf:
3630 case RecurKind::FAnyOf:
3631 return true;
3632 default:
3633 return false;
3634 }
3635}
3636
3637 InstructionCost
3638 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
3639 FastMathFlags FMF,
3640 TTI::TargetCostKind CostKind) {
3641 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3642
3643 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3644 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3645
3646 InstructionCost LegalizationCost = 0;
3647 if (LT.first > 1) {
3648 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3649 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3650 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3651 }
3652
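// The horizontal min/max itself (e.g. a single sminv/fminnmv style reduction)
// is modeled with a flat cost of 2 below; LegalizationCost only contributes
// when the type is split and extra pairwise min/max intrinsics are needed.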
3653 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3654}
3655
3656 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3657 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3658 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3659 InstructionCost LegalizationCost = 0;
3660 if (LT.first > 1) {
3661 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3662 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3663 LegalizationCost *= LT.first - 1;
3664 }
3665
3666 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3667 assert(ISD && "Invalid opcode");
3668 // Add the final reduction cost for the legal horizontal reduction
3669 switch (ISD) {
3670 case ISD::ADD:
3671 case ISD::AND:
3672 case ISD::OR:
3673 case ISD::XOR:
3674 case ISD::FADD:
3675 return LegalizationCost + 2;
3676 default:
3677 return InstructionCost::getInvalid();
3678 }
3679}
3680
3681 InstructionCost
3682 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3683 std::optional<FastMathFlags> FMF,
3684 TTI::TargetCostKind CostKind) {
3685 if (TTI::requiresOrderedReduction(FMF)) {
3686 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3687 InstructionCost BaseCost =
3688 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3689 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3690 // end up vectorizing for more computationally intensive loops.
3691 return BaseCost + FixedVTy->getNumElements();
3692 }
3693
3694 if (Opcode != Instruction::FAdd)
3695 return InstructionCost::getInvalid();
3696
3697 auto *VTy = cast<ScalableVectorType>(ValTy);
3698 InstructionCost Cost =
3699 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3700 Cost *= getMaxNumElements(VTy->getElementCount());
3701 return Cost;
3702 }
3703
3704 if (isa<ScalableVectorType>(ValTy))
3705 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3706
3707 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3708 MVT MTy = LT.second;
3709 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3710 assert(ISD && "Invalid opcode");
3711
3712 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3713 // instructions as twice a normal vector add, plus 1 for each legalization
3714 // step (LT.first). This is the only arithmetic vector reduction operation for
3715 // which we have an instruction.
3716 // OR, XOR and AND costs should match the codegen from:
3717 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3718 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3719 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3720 static const CostTblEntry CostTblNoPairwise[]{
3721 {ISD::ADD, MVT::v8i8, 2},
3722 {ISD::ADD, MVT::v16i8, 2},
3723 {ISD::ADD, MVT::v4i16, 2},
3724 {ISD::ADD, MVT::v8i16, 2},
3725 {ISD::ADD, MVT::v4i32, 2},
3726 {ISD::ADD, MVT::v2i64, 2},
3727 {ISD::OR, MVT::v8i8, 15},
3728 {ISD::OR, MVT::v16i8, 17},
3729 {ISD::OR, MVT::v4i16, 7},
3730 {ISD::OR, MVT::v8i16, 9},
3731 {ISD::OR, MVT::v2i32, 3},
3732 {ISD::OR, MVT::v4i32, 5},
3733 {ISD::OR, MVT::v2i64, 3},
3734 {ISD::XOR, MVT::v8i8, 15},
3735 {ISD::XOR, MVT::v16i8, 17},
3736 {ISD::XOR, MVT::v4i16, 7},
3737 {ISD::XOR, MVT::v8i16, 9},
3738 {ISD::XOR, MVT::v2i32, 3},
3739 {ISD::XOR, MVT::v4i32, 5},
3740 {ISD::XOR, MVT::v2i64, 3},
3741 {ISD::AND, MVT::v8i8, 15},
3742 {ISD::AND, MVT::v16i8, 17},
3743 {ISD::AND, MVT::v4i16, 7},
3744 {ISD::AND, MVT::v8i16, 9},
3745 {ISD::AND, MVT::v2i32, 3},
3746 {ISD::AND, MVT::v4i32, 5},
3747 {ISD::AND, MVT::v2i64, 3},
3748 };
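// Reading the table: an add reduction of v4i32 maps onto a single addv and is
// costed at 2, while an or reduction of v4i32 is costed at 5 to match the
// longer shift/or sequence produced by the codegen tests referenced above.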
3749 switch (ISD) {
3750 default:
3751 break;
3752 case ISD::ADD:
3753 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3754 return (LT.first - 1) + Entry->Cost;
3755 break;
3756 case ISD::XOR:
3757 case ISD::AND:
3758 case ISD::OR:
3759 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3760 if (!Entry)
3761 break;
3762 auto *ValVTy = cast<FixedVectorType>(ValTy);
3763 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3764 isPowerOf2_32(ValVTy->getNumElements())) {
3765 InstructionCost ExtraCost = 0;
3766 if (LT.first != 1) {
3767 // Type needs to be split, so there is an extra cost of LT.first - 1
3768 // arithmetic ops.
3769 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3770 MTy.getVectorNumElements());
3771 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3772 ExtraCost *= LT.first - 1;
3773 }
3774 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3775 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3776 return Cost + ExtraCost;
3777 }
3778 break;
3779 }
3780 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3781}
3782
3783 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3784 static const CostTblEntry ShuffleTbl[] = {
3785 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3786 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3787 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3788 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3789 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3790 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3791 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3792 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3793 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3794 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3795 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3796 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3797 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3798 };
3799
3800 // The code-generator is currently not able to handle scalable vectors
3801 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3802 // it. This change will be removed when code-generation for these types is
3803 // sufficiently reliable.
3804 if (Tp->getElementCount() == ElementCount::getScalable(1))
3805 return InstructionCost::getInvalid();
3806
3807 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3808 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3810 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3811 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3812 : LT.second;
3813 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3814 InstructionCost LegalizationCost = 0;
3815 if (Index < 0) {
3816 LegalizationCost =
3817 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3818 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3819 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3820 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3821 }
3822
3823 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
3824 // Cost performed on a promoted type.
3825 if (LT.second.getScalarType() == MVT::i1) {
3826 LegalizationCost +=
3827 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3828 TTI::CastContextHint::None, CostKind) +
3829 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3830 TTI::CastContextHint::None, CostKind);
3831 }
3832 const auto *Entry =
3833 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3834 assert(Entry && "Illegal Type for Splice");
3835 LegalizationCost += Entry->Cost;
3836 return LegalizationCost * LT.first;
3837}
3838
3839 InstructionCost AArch64TTIImpl::getShuffleCost(
3840 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
3841 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
3842 ArrayRef<const Value *> Args, const Instruction *CxtI) {
3843 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3844
3845 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3846 // into smaller vectors and sum the cost of each shuffle.
3847 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3848 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3849 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
3850
3851 // Check for LD3/LD4 instructions, which are represented in llvm IR as
3852 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3853 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
3854 // cost than just the load.
3855 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
3858 return std::max<InstructionCost>(1, LT.first / 4);
3859
3860 // Check for ST3/ST4 instructions, which are represented in llvm IR as
3861 // store(interleaving-shuffle). The shuffle cost could potentially be free,
3862 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
3863 // cost than just the store.
3864 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
3866 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
3868 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
3869 return LT.first;
3870
3871 unsigned TpNumElts = Mask.size();
3872 unsigned LTNumElts = LT.second.getVectorNumElements();
3873 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3874 VectorType *NTp =
3875 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3876 InstructionCost Cost;
3877 for (unsigned N = 0; N < NumVecs; N++) {
3878 SmallVector<int> NMask;
3879 // Split the existing mask into chunks of size LTNumElts. Track the source
3880 // sub-vectors to ensure the result has at most 2 inputs.
3881 unsigned Source1, Source2;
3882 unsigned NumSources = 0;
3883 for (unsigned E = 0; E < LTNumElts; E++) {
3884 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3885 : PoisonMaskElem;
3886 if (MaskElt < 0) {
3887 NMask.push_back(PoisonMaskElem);
3888 continue;
3889 }
3890
3891 // Calculate which source from the input this comes from and whether it
3892 // is new to us.
3893 unsigned Source = MaskElt / LTNumElts;
3894 if (NumSources == 0) {
3895 Source1 = Source;
3896 NumSources = 1;
3897 } else if (NumSources == 1 && Source != Source1) {
3898 Source2 = Source;
3899 NumSources = 2;
3900 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3901 NumSources++;
3902 }
3903
3904 // Add to the new mask. For the NumSources>2 case these are not correct,
3905 // but are only used for the modular lane number.
3906 if (Source == Source1)
3907 NMask.push_back(MaskElt % LTNumElts);
3908 else if (Source == Source2)
3909 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3910 else
3911 NMask.push_back(MaskElt % LTNumElts);
3912 }
3913 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3914 // getShuffleCost. If not then cost it using the worst case.
3915 if (NumSources <= 2)
3916 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3917 : TTI::SK_PermuteTwoSrc,
3918 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
3919 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3920 return ME.value() % LTNumElts == ME.index();
3921 }))
3922 Cost += LTNumElts - 1;
3923 else
3924 Cost += LTNumElts;
3925 }
3926 return Cost;
3927 }
3928
3929 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
3930 // Treat extractsubvector as single op permutation.
3931 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
3932 if (IsExtractSubvector && LT.second.isFixedLengthVector())
3933 Kind = TTI::SK_PermuteSingleSrc;
3934
3935 // Check for broadcast loads, which are supported by the LD1R instruction.
3936 // In terms of code-size, the shuffle vector is free when a load + dup get
3937 // folded into a LD1R. That's what we check and return here. For performance
3938 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3939 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3940 // that we model the load + dup sequence slightly higher because LD1R is a
3941 // high latency instruction.
3942 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3943 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3944 if (IsLoad && LT.second.isVector() &&
3945 isLegalBroadcastLoad(Tp->getElementType(),
3946 LT.second.getVectorElementCount()))
3947 return 0;
3948 }
3949
3950 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3951 // from the perfect shuffle tables.
3952 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3953 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3954 all_of(Mask, [](int E) { return E < 8; }))
3955 return getPerfectShuffleCost(Mask);
3956
3957 // Check for identity masks, which we can treat as free.
3958 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
3959 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3960 all_of(enumerate(Mask), [](const auto &M) {
3961 return M.value() < 0 || M.value() == (int)M.index();
3962 }))
3963 return 0;
3964
3965 // Check for other shuffles that are not SK_ kinds but we have native
3966 // instructions for, for example ZIP and UZP.
3967 unsigned Unused;
3968 if (LT.second.isFixedLengthVector() &&
3969 LT.second.getVectorNumElements() == Mask.size() &&
3970 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
3971 (isZIPMask(Mask, LT.second, Unused) ||
3972 isUZPMask(Mask, LT.second, Unused) ||
3973 // Check for non-zero lane splats
3974 all_of(drop_begin(Mask),
3975 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
3976 return 1;
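// For example, shufflevector <8 x i16> %a, <8 x i16> %b,
//   <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
// matches a zip1 pattern and is therefore costed as a single instruction
// here.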
3977
3978 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3979 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3980 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3981 static const CostTblEntry ShuffleTbl[] = {
3982 // Broadcast shuffle kinds can be performed with 'dup'.
3983 {TTI::SK_Broadcast, MVT::v8i8, 1},
3984 {TTI::SK_Broadcast, MVT::v16i8, 1},
3985 {TTI::SK_Broadcast, MVT::v4i16, 1},
3986 {TTI::SK_Broadcast, MVT::v8i16, 1},
3987 {TTI::SK_Broadcast, MVT::v2i32, 1},
3988 {TTI::SK_Broadcast, MVT::v4i32, 1},
3989 {TTI::SK_Broadcast, MVT::v2i64, 1},
3990 {TTI::SK_Broadcast, MVT::v4f16, 1},
3991 {TTI::SK_Broadcast, MVT::v8f16, 1},
3992 {TTI::SK_Broadcast, MVT::v2f32, 1},
3993 {TTI::SK_Broadcast, MVT::v4f32, 1},
3994 {TTI::SK_Broadcast, MVT::v2f64, 1},
3995 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3996 // 'zip1/zip2' instructions.
3997 {TTI::SK_Transpose, MVT::v8i8, 1},
3998 {TTI::SK_Transpose, MVT::v16i8, 1},
3999 {TTI::SK_Transpose, MVT::v4i16, 1},
4000 {TTI::SK_Transpose, MVT::v8i16, 1},
4001 {TTI::SK_Transpose, MVT::v2i32, 1},
4002 {TTI::SK_Transpose, MVT::v4i32, 1},
4003 {TTI::SK_Transpose, MVT::v2i64, 1},
4004 {TTI::SK_Transpose, MVT::v4f16, 1},
4005 {TTI::SK_Transpose, MVT::v8f16, 1},
4006 {TTI::SK_Transpose, MVT::v2f32, 1},
4007 {TTI::SK_Transpose, MVT::v4f32, 1},
4008 {TTI::SK_Transpose, MVT::v2f64, 1},
4009 // Select shuffle kinds.
4010 // TODO: handle vXi8/vXi16.
4011 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4012 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4013 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4014 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4015 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4016 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4017 // PermuteSingleSrc shuffle kinds.
4018 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4019 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4020 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4021 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4022 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4023 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4024 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4025 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4026 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4027 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4028 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4029 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4030 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4031 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4032 // Reverse can be lowered with `rev`.
4033 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4034 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4035 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4036 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4037 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4038 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4039 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4040 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4041 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4042 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4043 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4044 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4045 // Splice can all be lowered as `ext`.
4046 {TTI::SK_Splice, MVT::v2i32, 1},
4047 {TTI::SK_Splice, MVT::v4i32, 1},
4048 {TTI::SK_Splice, MVT::v2i64, 1},
4049 {TTI::SK_Splice, MVT::v2f32, 1},
4050 {TTI::SK_Splice, MVT::v4f32, 1},
4051 {TTI::SK_Splice, MVT::v2f64, 1},
4052 {TTI::SK_Splice, MVT::v8f16, 1},
4053 {TTI::SK_Splice, MVT::v8bf16, 1},
4054 {TTI::SK_Splice, MVT::v8i16, 1},
4055 {TTI::SK_Splice, MVT::v16i8, 1},
4056 {TTI::SK_Splice, MVT::v4bf16, 1},
4057 {TTI::SK_Splice, MVT::v4f16, 1},
4058 {TTI::SK_Splice, MVT::v4i16, 1},
4059 {TTI::SK_Splice, MVT::v8i8, 1},
4060 // Broadcast shuffle kinds for scalable vectors
4061 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4062 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4063 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4064 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4065 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4066 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4067 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4068 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4069 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4070 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4071 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4072 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4073 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4074 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4075 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4076 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4077 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4078 // Handle the cases for vector.reverse with scalable vectors
4079 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4080 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4081 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4082 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4083 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4084 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4085 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4086 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4087 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4088 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4089 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4090 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4091 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4092 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4093 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4094 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4095 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4096 };
4097 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4098 return LT.first * Entry->Cost;
4099 }
4100
4101 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4102 return getSpliceCost(Tp, Index);
4103
4104 // Inserting a subvector can often be done with either a D, S or H register
4105 // move, so long as the inserted vector is "aligned".
4106 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4107 LT.second.getSizeInBits() <= 128 && SubTp) {
4108 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4109 if (SubLT.second.isVector()) {
4110 int NumElts = LT.second.getVectorNumElements();
4111 int NumSubElts = SubLT.second.getVectorNumElements();
4112 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4113 return SubLT.first;
4114 }
4115 }
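// For example, inserting a <2 x float> subvector into a <4 x float> at
// element 0 or 2 lines up with a simple D/S register move and is costed as
// SubLT.first, typically 1.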
4116
4117 // Restore optimal kind.
4118 if (IsExtractSubvector)
4119 Kind = TTI::SK_ExtractSubvector;
4120 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4121 CxtI);
4122}
4123
4124 static bool containsDecreasingPointers(Loop *TheLoop,
4125 PredicatedScalarEvolution *PSE) {
4126 const auto &Strides = DenseMap<Value *, const SCEV *>();
4127 for (BasicBlock *BB : TheLoop->blocks()) {
4128 // Scan the instructions in the block and look for addresses that are
4129 // consecutive and decreasing.
4130 for (Instruction &I : *BB) {
4131 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4132 Value *Ptr = getLoadStorePointerOperand(&I);
4133 Type *AccessTy = getLoadStoreType(&I);
4134 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4135 /*ShouldCheckWrap=*/false)
4136 .value_or(0) < 0)
4137 return true;
4138 }
4139 }
4140 }
4141 return false;
4142}
4143
4144 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
4145 if (!ST->hasSVE())
4146 return false;
4147
4148 // We don't currently support vectorisation with interleaving for SVE - with
4149 // such loops we're better off not using tail-folding. This gives us a chance
4150 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4151 if (TFI->IAI->hasGroups())
4152 return false;
4153
4154 TailFoldingOpts Required = TailFoldingOpts::Disabled;
4155 if (TFI->LVL->getReductionVars().size())
4156 Required |= TailFoldingOpts::Reductions;
4157 if (TFI->LVL->getFixedOrderRecurrences().size())
4158 Required |= TailFoldingOpts::Recurrences;
4159
4160 // We call this to discover whether any load/store pointers in the loop have
4161 // negative strides. This will require extra work to reverse the loop
4162 // predicate, which may be expensive.
4163 if (containsDecreasingPointers(TFI->LVL->getLoop(),
4164 TFI->LVL->getPredicatedScalarEvolution()))
4165 Required |= TailFoldingOpts::Reverse;
4166 if (Required == TailFoldingOpts::Disabled)
4167 Required |= TailFoldingOpts::Simple;
4168
4169 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
4170 Required))
4171 return false;
4172
4173 // Don't tail-fold for tight loops where we would be better off interleaving
4174 // with an unpredicated loop.
4175 unsigned NumInsns = 0;
4176 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4177 NumInsns += BB->sizeWithoutDebug();
4178 }
4179
4180 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4181 return NumInsns >= SVETailFoldInsnThreshold;
4182}
4183
4184 InstructionCost AArch64TTIImpl::getScalingFactorCost(Type *Ty,
4185 GlobalValue *BaseGV,
4186 int64_t BaseOffset, bool HasBaseReg,
4187 int64_t Scale, unsigned AddrSpace) const {
4188 // Scaling factors are not free at all.
4189 // Operands | Rt Latency
4190 // -------------------------------------------
4191 // Rt, [Xn, Xm] | 4
4192 // -------------------------------------------
4193 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4194 // Rt, [Xn, Wm, <extend> #imm] |
4195 TargetLoweringBase::AddrMode AM;
4196 AM.BaseGV = BaseGV;
4197 AM.BaseOffs = BaseOffset;
4198 AM.HasBaseReg = HasBaseReg;
4199 AM.Scale = Scale;
4200 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4201 // Scale represents reg2 * scale, thus account for 1 if
4202 // it is not equal to 0 or 1.
4203 return AM.Scale != 0 && AM.Scale != 1;
4204 return -1;
4205}
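// For example, an addressing mode like [Xn, Xm, lsl #2] has AM.Scale == 4 and
// is reported as costing 1 extra, a plain [Xn, Xm] or immediate-offset mode is
// treated as free, and combinations that are not legal return -1.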
4206
4207 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
4208 // For the binary operators (e.g. or) we need to be more careful than for
4209 // selects; here we only transform them if they are already at a natural
4210 // break point in the code - the end of a block with an unconditional
4211 // terminator.
4212 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4213 isa<BranchInst>(I->getNextNode()) &&
4214 cast<BranchInst>(I->getNextNode())->isUnconditional())
4215 return true;
4216 return BaseT::shouldTreatInstructionLikeSelect(I);
4217}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
amdgpu AMDGPU Register Bank Select
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
IntegerType * Int32Ty
#define P(N)
const char LLVMTargetMachineRef TM
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
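The members above are the AArch64 overrides of the generic cost-model hooks. A minimal, hypothetical sketch of how an IR-level pass reaches them through the common TargetTransformInfo interface (the helper name here is illustrative, not part of this file):
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Query the throughput cost of an integer add on the given type; on AArch64
  // this dispatches into AArch64TTIImpl::getArithmeticInstrCost.
  static InstructionCost addCostFor(const TargetTransformInfo &TTI, Type *Ty) {
    return TTI.getArithmeticInstrCost(Instruction::Add, Ty,
                                      TargetTransformInfo::TCK_RecipThroughput);
  }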
Class for arbitrary precision integers.
Definition: APInt.h:76
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1620
unsigned countLeadingOnes() const
Definition: APInt.h:1574
void negate()
Negate this APInt in place.
Definition: APInt.h:1421
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
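A small illustration of the APInt queries listed above, in the style a cost model uses to classify immediate divisors; the helper name and fallback value are made up:
  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  // Return the shift amount when the divisor is +/- a power of two, otherwise 0.
  static unsigned shiftForPow2Divisor(const APInt &Divisor) {
    if (Divisor.isPowerOf2())
      return Divisor.logBase2();
    if (Divisor.isNegatedPowerOf2())
      return (-Divisor).logBase2();
    return 0;
  }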
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:582
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:969
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:762
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:654
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:891
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:855
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:339
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name, BasicBlock::iterator InsertBefore)
Definition: InstrTypes.h:299
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool isStrictFP() const
Determine if the call requires strict floating point semantics.
Definition: InstrTypes.h:2233
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1781
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:996
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:998
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:1000
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:1009
bool isIntPredicate() const
Definition: InstrTypes.h:1123
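The four predicate bits above encode (unordered, greater, less, equal). A throwaway helper, purely to show how the listed FCMP_* values and isIntPredicate() are matched in practice (the function name and the set of "cheap" predicates are illustrative):
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  // Accept integer predicates unconditionally, plus the ordered float
  // predicates listed above and FCMP_UNE.
  static bool isCheaplyLoweredPredicate(const CmpInst &Cmp) {
    if (Cmp.isIntPredicate())
      return true;
    switch (Cmp.getPredicate()) {
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return true;
    default:
      return false;
    }
  }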
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1663
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
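A quick sketch of the fixed/scalable distinction that ElementCount carries; the demo function exists only for illustration:
  #include "llvm/Support/TypeSize.h"
  #include <cassert>
  using namespace llvm;

  static void elementCountDemo() {
    ElementCount Fixed = ElementCount::getFixed(4);    // <4 x T>
    ElementCount Scal = ElementCount::getScalable(4);  // <vscale x 4 x T>
    assert(!Fixed.isScalable() && Scal.isScalable());
    assert(Fixed.getKnownMinValue() == 4 && Scal.getKnownMinValue() == 4);
  }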
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:88
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:559
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to all NumElts elements.
Definition: IRBuilder.cpp:1214
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1110
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:544
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:531
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2205
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:554
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
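A self-contained sketch of a few of the IRBuilder calls listed above; the function, the fixed alignment, and the value names are purely illustrative, not something this file emits:
  #include "llvm/IR/Constant.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // Splat a zero pass-through value and emit a masked load of a fixed vector.
  static Value *emitMaskedLoad(IRBuilder<> &B, Type *EltTy, Value *Ptr,
                               Value *Mask, unsigned NumElts) {
    auto *VecTy = FixedVectorType::get(EltTy, NumElts);
    Value *PassThru =
        B.CreateVectorSplat(NumElts, Constant::getNullValue(EltTy));
    return B.CreateMaskedLoad(VecTy, Ptr, Align(16), Mask, PassThru, "mload");
  }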
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr, BasicBlock::iterator InsertBefore)
The core instruction combiner logic.
Definition: InstCombiner.h:47
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:386
BuilderTy & Builder
Definition: InstCombiner.h:60
static InstructionCost getInvalid(CostType Val=0)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:650
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:71
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:627
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:686
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
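StringSwitch plus StringRef::split is the usual idiom for parsing "name+flag" style option strings. A hypothetical mini-parser, with invented tag values:
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  // Map the part before the first '+' to a small integer tag; -1 if unknown.
  static int classifyOptionHead(StringRef Opt) {
    StringRef Head = Opt.split('+').first;
    return StringSwitch<int>(Head)
        .Case("disabled", 0)
        .Case("simple", 1)
        .Case("all", 2)
        .Default(-1);
  }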
Class to represent struct types.
Definition: DerivedTypes.h:216
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:333
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:216
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:926
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1465
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:933
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:777
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:836
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:521
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:553
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:560
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:882
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:573
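The matchers above compose into declarative checks. A loose, illustrative example (not the exact pattern this file matches) that recognises a one-use add of two extended values:
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // True if V is an add (in either operand order) of two zext/sext results
  // and V itself has a single use.
  static bool isAddOfExtends(Value *V) {
    Value *X = nullptr, *Y = nullptr;
    return match(V, m_OneUse(m_c_Add(m_ZExtOrSExt(m_Value(X)),
                                     m_ZExtOrSExt(m_Value(Y)))));
  }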
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
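Cost tables are flat arrays keyed by (ISD opcode, MVT). A made-up table and lookup in that style; the entries, numbers, and names are invented for illustration:
  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  using namespace llvm;

  static const CostTblEntry ExampleMulTbl[] = {
      {ISD::MUL, MVT::v4i32, 2},
      {ISD::MUL, MVT::v2i64, 4},
  };

  static unsigned exampleMulCost(MVT Ty) {
    if (const auto *Entry = CostTableLookup(ExampleMulTbl, ISD::MUL, Ty))
      return Entry->Cost;
    return 1; // table miss: fall back to a generic estimate
  }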
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
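enumerate() pairs each element with its zero-based index; a small sketch of walking a shuffle mask with it (the helper itself is illustrative):
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  using namespace llvm;

  // Return the index of the first sentinel (negative) mask element, or -1.
  static int firstSentinelMaskElt(ArrayRef<int> Mask) {
    for (const auto &E : enumerate(Mask))
      if (E.value() < 0)
        return static_cast<int>(E.index());
    return -1;
  }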
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
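The power-of-two helpers are constexpr, so simple sanity checks can live in a static_assert; an illustrative pairing with Log2_32 (the wrapper is made up):
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  static_assert(isPowerOf2_32(64u) && isPowerOf2_64(uint64_t(1) << 40),
                "both helpers require a power of two greater than zero");

  // Log2_32 is the matching floor-log2, e.g. Log2_32(64) == 6; it is
  // documented to return -1 for zero, so guard zero before calling it.
  static unsigned log2OrZero(uint32_t V) { return V ? Log2_32(V) : 0; }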
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
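A sketch of the computeKnownBits entry point above, shown only to illustrate the call shape; the wrapper name is invented:
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // True when value tracking can prove the sign bit of V is clear.
  static bool knownNonNegative(const Value *V, const DataLayout &DL) {
    KnownBits Known(V->getType()->getScalarSizeInBits());
    computeKnownBits(V, Known, DL);
    return Known.isNonNegative();
  }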
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
InstructionCost Cost
@ Default
The result values are uniform if and only if all operands are uniform.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:360
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
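EVT describes any value type, legal or not, while MVT covers only the "simple" machine types. An illustrative helper combining the listed queries (the function is not part of this file):
  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Total bit width of a simple fixed-length vector EVT, or 0 otherwise.
  static uint64_t fixedVectorBits(EVT VT) {
    if (!VT.isSimple() || !VT.isFixedLengthVector())
      return 0;
    return uint64_t(VT.getVectorNumElements()) * VT.getScalarSizeInBits();
  }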
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55