LLVM 19.0.0git
AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
45static cl::opt<unsigned>
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
47 cl::Hidden);
48
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
53
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
57
58static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
59 cl::init(true), cl::Hidden);
60
61namespace {
62class TailFoldingOption {
63 // These bitfields will only ever be set to something non-zero in operator=,
64 // when setting the -sve-tail-folding option. This option should always be of
65 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
66 // InitialBits is one of (disabled|all|simple). EnableBits represents
67 // additional flags we're enabling, and DisableBits for those flags we're
68 // disabling. The default flag is tracked in the variable NeedsDefault, since
69 // at the time of setting the option we may not know what the default value
70 // for the CPU is.
71 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
72 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
73 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
74
75 // This value needs to be initialised to true in case the user does not
76 // explicitly set the -sve-tail-folding option.
77 bool NeedsDefault = true;
78
79 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
80
81 void setNeedsDefault(bool V) { NeedsDefault = V; }
82
83 void setEnableBit(TailFoldingOpts Bit) {
84 EnableBits |= Bit;
85 DisableBits &= ~Bit;
86 }
87
88 void setDisableBit(TailFoldingOpts Bit) {
89 EnableBits &= ~Bit;
90 DisableBits |= Bit;
91 }
92
93 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
94 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
95
96 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
97 "Initial bits should only include one of "
98 "(disabled|all|simple|default)");
99 Bits = NeedsDefault ? DefaultBits : InitialBits;
100 Bits |= EnableBits;
101 Bits &= ~DisableBits;
102
103 return Bits;
104 }
105
106 void reportError(std::string Opt) {
107 errs() << "invalid argument '" << Opt
108 << "' to -sve-tail-folding=; the option should be of the form\n"
109 " (disabled|all|default|simple)[+(reductions|recurrences"
110 "|reverse|noreductions|norecurrences|noreverse)]\n";
111 report_fatal_error("Unrecognised tail-folding option");
112 }
113
114public:
115
116 void operator=(const std::string &Val) {
117 // If the user explicitly sets -sve-tail-folding= then treat as an error.
118 if (Val.empty()) {
119 reportError("");
120 return;
121 }
122
123 // Since the user is explicitly setting the option we don't automatically
124 // need the default unless they require it.
125 setNeedsDefault(false);
126
127 SmallVector<StringRef, 4> TailFoldTypes;
128 StringRef(Val).split(TailFoldTypes, '+', -1, false);
129
130 unsigned StartIdx = 1;
131 if (TailFoldTypes[0] == "disabled")
132 setInitialBits(TailFoldingOpts::Disabled);
133 else if (TailFoldTypes[0] == "all")
134 setInitialBits(TailFoldingOpts::All);
135 else if (TailFoldTypes[0] == "default")
136 setNeedsDefault(true);
137 else if (TailFoldTypes[0] == "simple")
138 setInitialBits(TailFoldingOpts::Simple);
139 else {
140 StartIdx = 0;
141 setInitialBits(TailFoldingOpts::Disabled);
142 }
143
144 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
145 if (TailFoldTypes[I] == "reductions")
146 setEnableBit(TailFoldingOpts::Reductions);
147 else if (TailFoldTypes[I] == "recurrences")
148 setEnableBit(TailFoldingOpts::Recurrences);
149 else if (TailFoldTypes[I] == "reverse")
150 setEnableBit(TailFoldingOpts::Reverse);
151 else if (TailFoldTypes[I] == "noreductions")
152 setDisableBit(TailFoldingOpts::Reductions);
153 else if (TailFoldTypes[I] == "norecurrences")
154 setDisableBit(TailFoldingOpts::Recurrences);
155 else if (TailFoldTypes[I] == "noreverse")
156 setDisableBit(TailFoldingOpts::Reverse);
157 else
158 reportError(Val);
159 }
160 }
161
162 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
163 return (getBits(DefaultBits) & Required) == Required;
164 }
165};
166} // namespace
167
168TailFoldingOption TailFoldingOptionLoc;
169
171 "sve-tail-folding",
172 cl::desc(
173 "Control the use of vectorisation using tail-folding for SVE where the"
174 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
175 "\ndisabled (Initial) No loop types will vectorize using "
176 "tail-folding"
177 "\ndefault (Initial) Uses the default tail-folding settings for "
178 "the target CPU"
179 "\nall (Initial) All legal loop types will vectorize using "
180 "tail-folding"
181 "\nsimple (Initial) Use tail-folding for simple loops (not "
182 "reductions or recurrences)"
183 "\nreductions Use tail-folding for loops containing reductions"
184 "\nnoreductions Inverse of above"
185 "\nrecurrences Use tail-folding for loops containing fixed order "
186 "recurrences"
187 "\nnorecurrences Inverse of above"
188 "\nreverse Use tail-folding for loops requiring reversed "
189 "predicates"
190 "\nnoreverse Inverse of above"),
191 cl::location(TailFoldingOptionLoc));
192
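// Illustrative examples of the option syntax above (not part of the upstream
// source): assuming a CPU whose default is Disabled,
//   -sve-tail-folding=all+noreverse       -> All & ~Reverse
//   -sve-tail-folding=default+reductions  -> CPU default | Reductions
//   -sve-tail-folding=simple              -> Simple only
// A malformed value such as -sve-tail-folding=foo is rejected via
// reportError() in TailFoldingOption::operator= above.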
193// Experimental option that will only be fully functional when the
194// code-generator is changed to use SVE instead of NEON for all fixed-width
195// operations.
197 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
198
199// Experimental option that will only be fully functional when the cost-model
200// and code-generator have been changed to avoid using scalable vector
201// instructions that are not legal in streaming SVE mode.
203 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
204
205static bool isSMEABIRoutineCall(const CallInst &CI) {
206 const auto *F = CI.getCalledFunction();
207 return F && StringSwitch<bool>(F->getName())
208 .Case("__arm_sme_state", true)
209 .Case("__arm_tpidr2_save", true)
210 .Case("__arm_tpidr2_restore", true)
211 .Case("__arm_za_disable", true)
212 .Default(false);
213}
214
215/// Returns true if the function has explicit operations that can only be
216/// lowered using incompatible instructions for the selected mode. This also
217/// returns true if the function F may use or modify ZA state.
218static bool hasPossibleIncompatibleOps(const Function *F) {
219 for (const BasicBlock &BB : *F) {
220 for (const Instruction &I : BB) {
221 // Be conservative for now and assume that any call to inline asm or to
222 // intrinsics could result in non-streaming ops (e.g. calls to
223 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
224 // all native LLVM instructions can be lowered to compatible instructions.
225 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
226 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
227 isSMEABIRoutineCall(cast<CallInst>(I))))
228 return true;
229 }
230 }
231 return false;
232}
233
234bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
235 const Function *Callee) const {
236 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
237
238 // When inlining, we should consider the body of the function, not the
239 // interface.
240 if (CalleeAttrs.hasStreamingBody()) {
241 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
242 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
243 }
244
245 if (CalleeAttrs.isNewZA())
246 return false;
247
248 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
249 CallerAttrs.requiresSMChange(CalleeAttrs)) {
250 if (hasPossibleIncompatibleOps(Callee))
251 return false;
252 }
253
254 const TargetMachine &TM = getTLI()->getTargetMachine();
255
256 const FeatureBitset &CallerBits =
257 TM.getSubtargetImpl(*Caller)->getFeatureBits();
258 const FeatureBitset &CalleeBits =
259 TM.getSubtargetImpl(*Callee)->getFeatureBits();
260
261 // Inline a callee if its target-features are a subset of the caller's
262 // target-features.
263 return (CallerBits & CalleeBits) == CalleeBits;
264}
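// Illustrative example of the subset check above (not from the upstream
// source): a caller built with "+neon,+sve" may inline a callee that only
// requires "+neon", because the callee's feature bits are a subset of the
// caller's; a callee requiring "+sve2" into a caller without it fails the
// (CallerBits & CalleeBits) == CalleeBits test and is not inlined.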
265
266bool AArch64TTIImpl::areTypesABICompatible(
267 const Function *Caller, const Function *Callee,
268 const ArrayRef<Type *> &Types) const {
269 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
270 return false;
271
272 // We need to ensure that argument promotion does not attempt to promote
273 // pointers to fixed-length vector types larger than 128 bits like
274 // <8 x float> (and pointers to aggregate types which have such fixed-length
275 // vector type members) into the values of the pointees. Such vector types
276 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
277 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
278 // types can be safely treated as 128-bit NEON types and they cannot be
279 // distinguished in IR.
280 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
281 auto FVTy = dyn_cast<FixedVectorType>(Ty);
282 return FVTy &&
283 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
284 }))
285 return false;
286
287 return true;
288}
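// Illustrative example (not from the upstream source): when fixed-length SVE
// codegen is enabled (e.g. via -msve-vector-bits=256), a parameter of type
// ptr to <8 x float> (a 256-bit VLS vector) makes this hook return false, so
// argument promotion will not pass the pointee by value; a pointer to
// <4 x float> (128 bits) is still accepted because it can be treated as NEON.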
289
290unsigned
291AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
292 unsigned DefaultCallPenalty) const {
293 // This function calculates a penalty for executing Call in F.
294 //
295 // There are two ways this function can be called:
296 // (1) F:
297 // call from F -> G (the call here is Call)
298 //
299 // For (1), Call.getCaller() == F, so it will always return a high cost if
300 // a streaming-mode change is required (thus promoting the need to inline the
301 // function)
302 //
303 // (2) F:
304 // call from F -> G (the call here is not Call)
305 // G:
306 // call from G -> H (the call here is Call)
307 //
308 // For (2), if after inlining the body of G into F the call to H requires a
309 // streaming-mode change, and the call to G from F would also require a
310 // streaming-mode change, then there is benefit to do the streaming-mode
311 // change only once and avoid inlining of G into F.
312 SMEAttrs FAttrs(*F);
313 SMEAttrs CalleeAttrs(Call);
314 if (FAttrs.requiresSMChange(CalleeAttrs)) {
315 if (F == Call.getCaller()) // (1)
316 return CallPenaltyChangeSM * DefaultCallPenalty;
317 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
318 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
319 }
320
321 return DefaultCallPenalty;
322}
323
328 ST->isNeonAvailable());
329}
330
331/// Calculate the cost of materializing a 64-bit value. This helper
332/// method might only calculate a fraction of a larger immediate. Therefore it
333/// is valid to return a cost of ZERO.
334InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
335 // Check if the immediate can be encoded within an instruction.
336 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
337 return 0;
338
339 if (Val < 0)
340 Val = ~Val;
341
342 // Calculate how many moves we will need to materialize this constant.
343 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
344 AArch64_IMM::expandMOVImm(Val, 64, Insn);
345 return Insn.size();
346}
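// Illustrative example (not from the upstream source): 0 and any 64-bit
// logical immediate (e.g. 0x00ff00ff00ff00ff) cost 0 above; a value such as
// 0x1234 expands to a single MOVZ (cost 1); and 0x123456789abcdef0, which
// has four distinct non-zero 16-bit chunks, needs a MOVZ plus three MOVKs
// (cost 4).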
347
348/// Calculate the cost of materializing the given constant.
349InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
350 TTI::TargetCostKind CostKind) {
351 assert(Ty->isIntegerTy());
352
353 unsigned BitSize = Ty->getPrimitiveSizeInBits();
354 if (BitSize == 0)
355 return ~0U;
356
357 // Sign-extend all constants to a multiple of 64-bit.
358 APInt ImmVal = Imm;
359 if (BitSize & 0x3f)
360 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
361
362 // Split the constant into 64-bit chunks and calculate the cost for each
363 // chunk.
364 InstructionCost Cost = 0;
365 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
366 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
367 int64_t Val = Tmp.getSExtValue();
368 Cost += getIntImmCost(Val);
369 }
370 // We need at least one instruction to materialize the constant.
371 return std::max<InstructionCost>(1, Cost);
372}
373
374InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
375 const APInt &Imm, Type *Ty,
376 TTI::TargetCostKind CostKind,
377 Instruction *Inst) {
378 assert(Ty->isIntegerTy());
379
380 unsigned BitSize = Ty->getPrimitiveSizeInBits();
381 // There is no cost model for constants with a bit size of 0. Return TCC_Free
382 // here, so that constant hoisting will ignore this constant.
383 if (BitSize == 0)
384 return TTI::TCC_Free;
385
386 unsigned ImmIdx = ~0U;
387 switch (Opcode) {
388 default:
389 return TTI::TCC_Free;
390 case Instruction::GetElementPtr:
391 // Always hoist the base address of a GetElementPtr.
392 if (Idx == 0)
393 return 2 * TTI::TCC_Basic;
394 return TTI::TCC_Free;
395 case Instruction::Store:
396 ImmIdx = 0;
397 break;
398 case Instruction::Add:
399 case Instruction::Sub:
400 case Instruction::Mul:
401 case Instruction::UDiv:
402 case Instruction::SDiv:
403 case Instruction::URem:
404 case Instruction::SRem:
405 case Instruction::And:
406 case Instruction::Or:
407 case Instruction::Xor:
408 case Instruction::ICmp:
409 ImmIdx = 1;
410 break;
411 // Always return TCC_Free for the shift value of a shift instruction.
412 case Instruction::Shl:
413 case Instruction::LShr:
414 case Instruction::AShr:
415 if (Idx == 1)
416 return TTI::TCC_Free;
417 break;
418 case Instruction::Trunc:
419 case Instruction::ZExt:
420 case Instruction::SExt:
421 case Instruction::IntToPtr:
422 case Instruction::PtrToInt:
423 case Instruction::BitCast:
424 case Instruction::PHI:
425 case Instruction::Call:
426 case Instruction::Select:
427 case Instruction::Ret:
428 case Instruction::Load:
429 break;
430 }
431
432 if (Idx == ImmIdx) {
433 int NumConstants = (BitSize + 63) / 64;
434 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
435 return (Cost <= NumConstants * TTI::TCC_Basic)
436 ? static_cast<int>(TTI::TCC_Free)
437 : Cost;
438 }
439 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
440}
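// Illustrative example (not from the upstream source): for
// "add i64 %x, 1234567890123" the constant sits at ImmIdx == 1; it fits one
// 64-bit chunk (NumConstants == 1) but needs several MOVZ/MOVK instructions
// to materialize, so a non-free cost is returned and constant hoisting may
// pull it out of a loop. For "shl i64 %x, 3" the shift amount is always
// reported as TCC_Free.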
441
442InstructionCost
443AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
444 const APInt &Imm, Type *Ty,
445 TTI::TargetCostKind CostKind) {
446 assert(Ty->isIntegerTy());
447
448 unsigned BitSize = Ty->getPrimitiveSizeInBits();
449 // There is no cost model for constants with a bit size of 0. Return TCC_Free
450 // here, so that constant hoisting will ignore this constant.
451 if (BitSize == 0)
452 return TTI::TCC_Free;
453
454 // Most (all?) AArch64 intrinsics do not support folding immediates into the
455 // selected instruction, so we compute the materialization cost for the
456 // immediate directly.
457 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
458 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
459
460 switch (IID) {
461 default:
462 return TTI::TCC_Free;
463 case Intrinsic::sadd_with_overflow:
464 case Intrinsic::uadd_with_overflow:
465 case Intrinsic::ssub_with_overflow:
466 case Intrinsic::usub_with_overflow:
467 case Intrinsic::smul_with_overflow:
468 case Intrinsic::umul_with_overflow:
469 if (Idx == 1) {
470 int NumConstants = (BitSize + 63) / 64;
471 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
472 return (Cost <= NumConstants * TTI::TCC_Basic)
473 ? static_cast<int>(TTI::TCC_Free)
474 : Cost;
475 }
476 break;
477 case Intrinsic::experimental_stackmap:
478 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
479 return TTI::TCC_Free;
480 break;
481 case Intrinsic::experimental_patchpoint_void:
482 case Intrinsic::experimental_patchpoint_i64:
483 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
484 return TTI::TCC_Free;
485 break;
486 case Intrinsic::experimental_gc_statepoint:
487 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
488 return TTI::TCC_Free;
489 break;
490 }
491 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
492}
493
494TargetTransformInfo::PopcntSupportKind
495AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
496 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
497 if (TyWidth == 32 || TyWidth == 64)
498 return TTI::PSK_FastHardware;
499 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
500 return TTI::PSK_Software;
501}
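// Illustrative example (not from the upstream source): a query for i32 or
// i64 reports PSK_FastHardware (popcount is cheap to lower, e.g. via NEON
// CNT plus a horizontal add), while wider types such as i128 currently fall
// back to PSK_Software as noted in the TODO above.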
502
503InstructionCost
504AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
505 TTI::TargetCostKind CostKind) {
506 auto *RetTy = ICA.getReturnType();
507 switch (ICA.getID()) {
508 case Intrinsic::umin:
509 case Intrinsic::umax:
510 case Intrinsic::smin:
511 case Intrinsic::smax: {
512 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
513 MVT::v8i16, MVT::v2i32, MVT::v4i32,
514 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
515 MVT::nxv2i64};
516 auto LT = getTypeLegalizationCost(RetTy);
517 // v2i64 types get converted to cmp+bif hence the cost of 2
518 if (LT.second == MVT::v2i64)
519 return LT.first * 2;
520 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
521 return LT.first;
522 break;
523 }
524 case Intrinsic::sadd_sat:
525 case Intrinsic::ssub_sat:
526 case Intrinsic::uadd_sat:
527 case Intrinsic::usub_sat: {
528 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
529 MVT::v8i16, MVT::v2i32, MVT::v4i32,
530 MVT::v2i64};
531 auto LT = getTypeLegalizationCost(RetTy);
532 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
533 // need to extend the type, as it uses shr(qadd(shl, shl)).
534 unsigned Instrs =
535 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
536 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
537 return LT.first * Instrs;
538 break;
539 }
540 case Intrinsic::abs: {
541 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
542 MVT::v8i16, MVT::v2i32, MVT::v4i32,
543 MVT::v2i64};
544 auto LT = getTypeLegalizationCost(RetTy);
545 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
546 return LT.first;
547 break;
548 }
549 case Intrinsic::bswap: {
550 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
551 MVT::v4i32, MVT::v2i64};
552 auto LT = getTypeLegalizationCost(RetTy);
553 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
554 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
555 return LT.first;
556 break;
557 }
558 case Intrinsic::experimental_stepvector: {
559 InstructionCost Cost = 1; // Cost of the `index' instruction
560 auto LT = getTypeLegalizationCost(RetTy);
561 // Legalisation of illegal vectors involves an `index' instruction plus
562 // (LT.first - 1) vector adds.
563 if (LT.first > 1) {
564 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
565 InstructionCost AddCost =
566 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
567 Cost += AddCost * (LT.first - 1);
568 }
569 return Cost;
570 }
571 case Intrinsic::bitreverse: {
572 static const CostTblEntry BitreverseTbl[] = {
573 {Intrinsic::bitreverse, MVT::i32, 1},
574 {Intrinsic::bitreverse, MVT::i64, 1},
575 {Intrinsic::bitreverse, MVT::v8i8, 1},
576 {Intrinsic::bitreverse, MVT::v16i8, 1},
577 {Intrinsic::bitreverse, MVT::v4i16, 2},
578 {Intrinsic::bitreverse, MVT::v8i16, 2},
579 {Intrinsic::bitreverse, MVT::v2i32, 2},
580 {Intrinsic::bitreverse, MVT::v4i32, 2},
581 {Intrinsic::bitreverse, MVT::v1i64, 2},
582 {Intrinsic::bitreverse, MVT::v2i64, 2},
583 };
584 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
585 const auto *Entry =
586 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
587 if (Entry) {
588 // The cost model uses the legal type (i32) that i8 and i16 are promoted
589 // to, plus 1, so that we match the actual lowering cost.
590 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
591 TLI->getValueType(DL, RetTy, true) == MVT::i16)
592 return LegalisationCost.first * Entry->Cost + 1;
593
594 return LegalisationCost.first * Entry->Cost;
595 }
596 break;
597 }
598 case Intrinsic::ctpop: {
599 if (!ST->hasNEON()) {
600 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
601 return getTypeLegalizationCost(RetTy).first * 12;
602 }
603 static const CostTblEntry CtpopCostTbl[] = {
604 {ISD::CTPOP, MVT::v2i64, 4},
605 {ISD::CTPOP, MVT::v4i32, 3},
606 {ISD::CTPOP, MVT::v8i16, 2},
607 {ISD::CTPOP, MVT::v16i8, 1},
608 {ISD::CTPOP, MVT::i64, 4},
609 {ISD::CTPOP, MVT::v2i32, 3},
610 {ISD::CTPOP, MVT::v4i16, 2},
611 {ISD::CTPOP, MVT::v8i8, 1},
612 {ISD::CTPOP, MVT::i32, 5},
613 };
614 auto LT = getTypeLegalizationCost(RetTy);
615 MVT MTy = LT.second;
616 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
617 // Extra cost of +1 when illegal vector types are legalized by promoting
618 // the integer type.
619 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
620 RetTy->getScalarSizeInBits()
621 ? 1
622 : 0;
623 return LT.first * Entry->Cost + ExtraCost;
624 }
625 break;
626 }
627 case Intrinsic::sadd_with_overflow:
628 case Intrinsic::uadd_with_overflow:
629 case Intrinsic::ssub_with_overflow:
630 case Intrinsic::usub_with_overflow:
631 case Intrinsic::smul_with_overflow:
632 case Intrinsic::umul_with_overflow: {
633 static const CostTblEntry WithOverflowCostTbl[] = {
634 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
635 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
636 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
637 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
638 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
639 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
640 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
641 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
642 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
643 {Intrinsic::usub_with_overflow, MVT::i8, 3},
644 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
645 {Intrinsic::usub_with_overflow, MVT::i16, 3},
646 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
647 {Intrinsic::usub_with_overflow, MVT::i32, 1},
648 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
649 {Intrinsic::usub_with_overflow, MVT::i64, 1},
650 {Intrinsic::smul_with_overflow, MVT::i8, 5},
651 {Intrinsic::umul_with_overflow, MVT::i8, 4},
652 {Intrinsic::smul_with_overflow, MVT::i16, 5},
653 {Intrinsic::umul_with_overflow, MVT::i16, 4},
654 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
655 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
656 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
657 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
658 };
659 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
660 if (MTy.isSimple())
661 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
662 MTy.getSimpleVT()))
663 return Entry->Cost;
664 break;
665 }
666 case Intrinsic::fptosi_sat:
667 case Intrinsic::fptoui_sat: {
668 if (ICA.getArgTypes().empty())
669 break;
670 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
671 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
672 EVT MTy = TLI->getValueType(DL, RetTy);
673 // Check for the legal types, which are where the size of the input and the
674 // output are the same, or we are using cvt f64->i32 or f32->i64.
675 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
676 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
677 LT.second == MVT::v2f64) &&
678 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
679 (LT.second == MVT::f64 && MTy == MVT::i32) ||
680 (LT.second == MVT::f32 && MTy == MVT::i64)))
681 return LT.first;
682 // Similarly for fp16 sizes
683 if (ST->hasFullFP16() &&
684 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
685 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
686 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
687 return LT.first;
688
689 // Otherwise we use a legal convert followed by a min+max
690 if ((LT.second.getScalarType() == MVT::f32 ||
691 LT.second.getScalarType() == MVT::f64 ||
692 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
693 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
694 Type *LegalTy =
695 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
696 if (LT.second.isVector())
697 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
698 InstructionCost Cost = 1;
699 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
700 LegalTy, {LegalTy, LegalTy});
701 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
702 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
703 LegalTy, {LegalTy, LegalTy});
704 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
705 return LT.first * Cost;
706 }
707 break;
708 }
709 case Intrinsic::fshl:
710 case Intrinsic::fshr: {
711 if (ICA.getArgs().empty())
712 break;
713
714 // TODO: Add handling for fshl where third argument is not a constant.
715 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
716 if (!OpInfoZ.isConstant())
717 break;
718
719 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
720 if (OpInfoZ.isUniform()) {
721 // FIXME: The costs could be lower if the codegen is better.
722 static const CostTblEntry FshlTbl[] = {
723 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
724 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
725 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
726 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
727 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
728 // to avoid having to duplicate the costs.
729 const auto *Entry =
730 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
731 if (Entry)
732 return LegalisationCost.first * Entry->Cost;
733 }
734
735 auto TyL = getTypeLegalizationCost(RetTy);
736 if (!RetTy->isIntegerTy())
737 break;
738
739 // Estimate cost manually, as types like i8 and i16 will get promoted to
740 // i32 and CostTableLookup will ignore the extra conversion cost.
741 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
742 RetTy->getScalarSizeInBits() < 64) ||
743 (RetTy->getScalarSizeInBits() % 64 != 0);
744 unsigned ExtraCost = HigherCost ? 1 : 0;
745 if (RetTy->getScalarSizeInBits() == 32 ||
746 RetTy->getScalarSizeInBits() == 64)
747 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
748 // extr instruction.
749 else if (HigherCost)
750 ExtraCost = 1;
751 else
752 break;
753 return TyL.first + ExtraCost;
754 }
755 default:
756 break;
757 }
758 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
759}
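// Illustrative cost query against the table-driven logic above (not from the
// upstream source): @llvm.umin on a legal <4 x i32> maps to a single UMIN and
// returns LT.first == 1, whereas @llvm.umin on <2 x i64> hits the special
// case near the top of the switch and costs 2 (cmp + bif). Anything not
// handled here falls through to the generic BaseT cost model.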
760
761/// The function will remove redundant reinterpret casts in the presence of
762/// control flow.
763static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
764 IntrinsicInst &II) {
765 SmallVector<Instruction *, 32> Worklist;
766 auto RequiredType = II.getType();
767
768 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
769 assert(PN && "Expected Phi Node!");
770
771 // Don't create a new Phi unless we can remove the old one.
772 if (!PN->hasOneUse())
773 return std::nullopt;
774
775 for (Value *IncValPhi : PN->incoming_values()) {
776 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
777 if (!Reinterpret ||
778 Reinterpret->getIntrinsicID() !=
779 Intrinsic::aarch64_sve_convert_to_svbool ||
780 RequiredType != Reinterpret->getArgOperand(0)->getType())
781 return std::nullopt;
782 }
783
784 // Create the new Phi
785 IC.Builder.SetInsertPoint(PN);
786 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
787 Worklist.push_back(PN);
788
789 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
790 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
791 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
792 Worklist.push_back(Reinterpret);
793 }
794
795 // Cleanup Phi Node and reinterprets
796 return IC.replaceInstUsesWith(II, NPN);
797}
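// Schematic IR for the PHI combine above (illustrative, not from the
// upstream source): a from_svbool whose operand is
//   %phi = phi <vscale x 16 x i1> [ %to_svbool.a, %bb.a ],
//                                 [ %to_svbool.b, %bb.b ]
// is rewritten to use a narrower
//   %phi.new = phi <vscale x 4 x i1> [ %a, %bb.a ], [ %b, %bb.b ]
// and the old PHI plus its to_svbool reinterprets become dead.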
798
799// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
800// => (binop (pred) (from_svbool _) (from_svbool _))
801//
802// The above transformation eliminates a `to_svbool` in the predicate
803// operand of bitwise operation `binop` by narrowing the vector width of
804// the operation. For example, it would convert a `<vscale x 16 x i1>
805// and` into a `<vscale x 4 x i1> and`. This is profitable because
806// to_svbool must zero the new lanes during widening, whereas
807// from_svbool is free.
808static std::optional<Instruction *>
809tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
810 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
811 if (!BinOp)
812 return std::nullopt;
813
814 auto IntrinsicID = BinOp->getIntrinsicID();
815 switch (IntrinsicID) {
816 case Intrinsic::aarch64_sve_and_z:
817 case Intrinsic::aarch64_sve_bic_z:
818 case Intrinsic::aarch64_sve_eor_z:
819 case Intrinsic::aarch64_sve_nand_z:
820 case Intrinsic::aarch64_sve_nor_z:
821 case Intrinsic::aarch64_sve_orn_z:
822 case Intrinsic::aarch64_sve_orr_z:
823 break;
824 default:
825 return std::nullopt;
826 }
827
828 auto BinOpPred = BinOp->getOperand(0);
829 auto BinOpOp1 = BinOp->getOperand(1);
830 auto BinOpOp2 = BinOp->getOperand(2);
831
832 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
833 if (!PredIntr ||
834 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
835 return std::nullopt;
836
837 auto PredOp = PredIntr->getOperand(0);
838 auto PredOpTy = cast<VectorType>(PredOp->getType());
839 if (PredOpTy != II.getType())
840 return std::nullopt;
841
842 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
843 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
844 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
845 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
846 if (BinOpOp1 == BinOpOp2)
847 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
848 else
849 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
850 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
851
852 auto NarrowedBinOp =
853 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
854 return IC.replaceInstUsesWith(II, NarrowedBinOp);
855}
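// Schematic IR for the combine above (illustrative, not from the upstream
// source): with a <vscale x 4 x i1> result type,
//   %wide = sve.and.z(<16 x i1> to_svbool(%p), %a, %b)
//   %res  = sve.convert.from.svbool.nxv4i1(%wide)
// is narrowed to an and.z over <vscale x 4 x i1> whose operands are
// from_svbool casts of %a and %b, so the lane-zeroing to_svbool of the
// predicate is no longer needed.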
856
857static std::optional<Instruction *>
858instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
859 // If the reinterpret instruction operand is a PHI Node
860 if (isa<PHINode>(II.getArgOperand(0)))
861 return processPhiNode(IC, II);
862
863 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
864 return BinOpCombine;
865
866 // Ignore converts to/from svcount_t.
867 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
868 isa<TargetExtType>(II.getType()))
869 return std::nullopt;
870
871 SmallVector<Instruction *, 32> CandidatesForRemoval;
872 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
873
874 const auto *IVTy = cast<VectorType>(II.getType());
875
876 // Walk the chain of conversions.
877 while (Cursor) {
878 // If the type of the cursor has fewer lanes than the final result, zeroing
879 // must take place, which breaks the equivalence chain.
880 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
881 if (CursorVTy->getElementCount().getKnownMinValue() <
882 IVTy->getElementCount().getKnownMinValue())
883 break;
884
885 // If the cursor has the same type as I, it is a viable replacement.
886 if (Cursor->getType() == IVTy)
887 EarliestReplacement = Cursor;
888
889 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
890
891 // If this is not an SVE conversion intrinsic, this is the end of the chain.
892 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
893 Intrinsic::aarch64_sve_convert_to_svbool ||
894 IntrinsicCursor->getIntrinsicID() ==
895 Intrinsic::aarch64_sve_convert_from_svbool))
896 break;
897
898 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
899 Cursor = IntrinsicCursor->getOperand(0);
900 }
901
902 // If no viable replacement in the conversion chain was found, there is
903 // nothing to do.
904 if (!EarliestReplacement)
905 return std::nullopt;
906
907 return IC.replaceInstUsesWith(II, EarliestReplacement);
908}
909
910static bool isAllActivePredicate(Value *Pred) {
911 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
912 Value *UncastedPred;
913 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
914 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
915 m_Value(UncastedPred)))))
916 // If the predicate has the same or fewer lanes than the uncasted
917 // predicate then we know the casting has no effect.
918 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
919 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
920 Pred = UncastedPred;
921
922 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
923 m_ConstantInt<AArch64SVEPredPattern::all>()));
924}
925
926static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
927 IntrinsicInst &II) {
928 // svsel(ptrue, x, y) => x
929 auto *OpPredicate = II.getOperand(0);
930 if (isAllActivePredicate(OpPredicate))
931 return IC.replaceInstUsesWith(II, II.getOperand(1));
932
933 auto Select =
934 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
935 return IC.replaceInstUsesWith(II, Select);
936}
937
938static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
939 IntrinsicInst &II) {
940 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
941 if (!Pg)
942 return std::nullopt;
943
944 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
945 return std::nullopt;
946
947 const auto PTruePattern =
948 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
949 if (PTruePattern != AArch64SVEPredPattern::vl1)
950 return std::nullopt;
951
952 // The intrinsic is inserting into lane zero so use an insert instead.
953 auto *IdxTy = Type::getInt64Ty(II.getContext());
954 auto *Insert = InsertElementInst::Create(
955 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
956 Insert->insertBefore(&II);
957 Insert->takeName(&II);
958
959 return IC.replaceInstUsesWith(II, Insert);
960}
961
962static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
963 IntrinsicInst &II) {
964 // Replace DupX with a regular IR splat.
965 auto *RetTy = cast<ScalableVectorType>(II.getType());
966 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
967 II.getArgOperand(0));
968 Splat->takeName(&II);
969 return IC.replaceInstUsesWith(II, Splat);
970}
971
972static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
973 IntrinsicInst &II) {
974 LLVMContext &Ctx = II.getContext();
975
976 // Check that the predicate is all active
977 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
978 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
979 return std::nullopt;
980
981 const auto PTruePattern =
982 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
983 if (PTruePattern != AArch64SVEPredPattern::all)
984 return std::nullopt;
985
986 // Check that we have a compare of zero..
987 auto *SplatValue =
988 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
989 if (!SplatValue || !SplatValue->isZero())
990 return std::nullopt;
991
992 // ..against a dupq
993 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
994 if (!DupQLane ||
995 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
996 return std::nullopt;
997
998 // Where the dupq is a lane 0 replicate of a vector insert
999 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1000 return std::nullopt;
1001
1002 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1003 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1004 return std::nullopt;
1005
1006 // Where the vector insert is a fixed constant vector insert into undef at
1007 // index zero
1008 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1009 return std::nullopt;
1010
1011 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1012 return std::nullopt;
1013
1014 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1015 if (!ConstVec)
1016 return std::nullopt;
1017
1018 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1019 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1020 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1021 return std::nullopt;
1022
1023 unsigned NumElts = VecTy->getNumElements();
1024 unsigned PredicateBits = 0;
1025
1026 // Expand intrinsic operands to a 16-bit byte level predicate
1027 for (unsigned I = 0; I < NumElts; ++I) {
1028 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1029 if (!Arg)
1030 return std::nullopt;
1031 if (!Arg->isZero())
1032 PredicateBits |= 1 << (I * (16 / NumElts));
1033 }
1034
1035 // If all bits are zero bail early with an empty predicate
1036 if (PredicateBits == 0) {
1037 auto *PFalse = Constant::getNullValue(II.getType());
1038 PFalse->takeName(&II);
1039 return IC.replaceInstUsesWith(II, PFalse);
1040 }
1041
1042 // Calculate largest predicate type used (where byte predicate is largest)
1043 unsigned Mask = 8;
1044 for (unsigned I = 0; I < 16; ++I)
1045 if ((PredicateBits & (1 << I)) != 0)
1046 Mask |= (I % 8);
1047
1048 unsigned PredSize = Mask & -Mask;
1049 auto *PredType = ScalableVectorType::get(
1050 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1051
1052 // Ensure all relevant bits are set
1053 for (unsigned I = 0; I < 16; I += PredSize)
1054 if ((PredicateBits & (1 << I)) == 0)
1055 return std::nullopt;
1056
1057 auto *PTruePat =
1058 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1059 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1060 {PredType}, {PTruePat});
1061 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1062 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1063 auto *ConvertFromSVBool =
1064 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1065 {II.getType()}, {ConvertToSVBool});
1066
1067 ConvertFromSVBool->takeName(&II);
1068 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1069}
1070
1071static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1072 IntrinsicInst &II) {
1073 Value *Pg = II.getArgOperand(0);
1074 Value *Vec = II.getArgOperand(1);
1075 auto IntrinsicID = II.getIntrinsicID();
1076 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1077
1078 // lastX(splat(X)) --> X
1079 if (auto *SplatVal = getSplatValue(Vec))
1080 return IC.replaceInstUsesWith(II, SplatVal);
1081
1082 // If x and/or y is a splat value then:
1083 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1084 Value *LHS, *RHS;
1085 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1086 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1087 auto *OldBinOp = cast<BinaryOperator>(Vec);
1088 auto OpC = OldBinOp->getOpcode();
1089 auto *NewLHS =
1090 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1091 auto *NewRHS =
1092 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1093 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1094 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
1095 return IC.replaceInstUsesWith(II, NewBinOp);
1096 }
1097 }
1098
1099 auto *C = dyn_cast<Constant>(Pg);
1100 if (IsAfter && C && C->isNullValue()) {
1101 // The intrinsic is extracting lane 0 so use an extract instead.
1102 auto *IdxTy = Type::getInt64Ty(II.getContext());
1103 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1104 Extract->insertBefore(&II);
1105 Extract->takeName(&II);
1106 return IC.replaceInstUsesWith(II, Extract);
1107 }
1108
1109 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1110 if (!IntrPG)
1111 return std::nullopt;
1112
1113 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1114 return std::nullopt;
1115
1116 const auto PTruePattern =
1117 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1118
1119 // Can the intrinsic's predicate be converted to a known constant index?
1120 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1121 if (!MinNumElts)
1122 return std::nullopt;
1123
1124 unsigned Idx = MinNumElts - 1;
1125 // Increment the index if extracting the element after the last active
1126 // predicate element.
1127 if (IsAfter)
1128 ++Idx;
1129
1130 // Ignore extracts whose index is larger than the known minimum vector
1131 // length. NOTE: This is an artificial constraint where we prefer to
1132 // maintain what the user asked for until an alternative is proven faster.
1133 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1134 if (Idx >= PgVTy->getMinNumElements())
1135 return std::nullopt;
1136
1137 // The intrinsic is extracting a fixed lane so use an extract instead.
1138 auto *IdxTy = Type::getInt64Ty(II.getContext());
1139 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1140 Extract->insertBefore(&II);
1141 Extract->takeName(&II);
1142 return IC.replaceInstUsesWith(II, Extract);
1143}
1144
1145static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1146 IntrinsicInst &II) {
1147 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1148 // integer variant across a variety of micro-architectures. Replace scalar
1149 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1150 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1151 // depending on the micro-architecture, but has been observed as generally
1152 // being faster, particularly when the CLAST[AB] op is a loop-carried
1153 // dependency.
1154 Value *Pg = II.getArgOperand(0);
1155 Value *Fallback = II.getArgOperand(1);
1156 Value *Vec = II.getArgOperand(2);
1157 Type *Ty = II.getType();
1158
1159 if (!Ty->isIntegerTy())
1160 return std::nullopt;
1161
1162 Type *FPTy;
1163 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1164 default:
1165 return std::nullopt;
1166 case 16:
1167 FPTy = IC.Builder.getHalfTy();
1168 break;
1169 case 32:
1170 FPTy = IC.Builder.getFloatTy();
1171 break;
1172 case 64:
1173 FPTy = IC.Builder.getDoubleTy();
1174 break;
1175 }
1176
1177 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1178 auto *FPVTy = VectorType::get(
1179 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1180 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1181 auto *FPII = IC.Builder.CreateIntrinsic(
1182 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1183 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1184 return IC.replaceInstUsesWith(II, FPIItoInt);
1185}
1186
1187static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1188 IntrinsicInst &II) {
1189 LLVMContext &Ctx = II.getContext();
1190 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1191 // can work with RDFFR_PP for ptest elimination.
1192 auto *AllPat =
1193 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1194 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1195 {II.getType()}, {AllPat});
1196 auto *RDFFR =
1197 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1198 RDFFR->takeName(&II);
1199 return IC.replaceInstUsesWith(II, RDFFR);
1200}
1201
1202static std::optional<Instruction *>
1203instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1204 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1205
1206 if (Pattern == AArch64SVEPredPattern::all) {
1207 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1208 auto *VScale = IC.Builder.CreateVScale(StepVal);
1209 VScale->takeName(&II);
1210 return IC.replaceInstUsesWith(II, VScale);
1211 }
1212
1213 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1214
1215 return MinNumElts && NumElts >= MinNumElts
1216 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1217 II, ConstantInt::get(II.getType(), MinNumElts)))
1218 : std::nullopt;
1219}
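// Illustrative example (not from the upstream source): for a CNTW-style
// intrinsic this helper is called with NumElts == 4, so the "all" pattern
// folds to vscale * 4, and a fixed pattern such as vl2 folds to the constant
// 2 because NumElts >= 2; patterns requesting more elements than NumElts are
// left untouched.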
1220
1221static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1222 IntrinsicInst &II) {
1223 Value *PgVal = II.getArgOperand(0);
1224 Value *OpVal = II.getArgOperand(1);
1225
1226 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1227 // Later optimizations prefer this form.
1228 if (PgVal == OpVal &&
1229 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1230 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1231 Value *Ops[] = {PgVal, OpVal};
1232 Type *Tys[] = {PgVal->getType()};
1233
1234 auto *PTest =
1235 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1236 PTest->takeName(&II);
1237
1238 return IC.replaceInstUsesWith(II, PTest);
1239 }
1240
1241 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1242 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1243
1244 if (!Pg || !Op)
1245 return std::nullopt;
1246
1247 Intrinsic::ID OpIID = Op->getIntrinsicID();
1248
1249 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1250 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1251 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1252 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1253 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1254
1255 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1256
1257 PTest->takeName(&II);
1258 return IC.replaceInstUsesWith(II, PTest);
1259 }
1260
1261 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1262 // Later optimizations may rewrite sequence to use the flag-setting variant
1263 // of instruction X to remove PTEST.
1264 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1265 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1266 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1267 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1268 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1269 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1270 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1271 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1272 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1273 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1274 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1275 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1276 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1277 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1278 Type *Tys[] = {Pg->getType()};
1279
1280 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1281 PTest->takeName(&II);
1282
1283 return IC.replaceInstUsesWith(II, PTest);
1284 }
1285
1286 return std::nullopt;
1287}
1288
1289template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1290static std::optional<Instruction *>
1291instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1292 bool MergeIntoAddendOp) {
1293 Value *P = II.getOperand(0);
1294 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1295 if (MergeIntoAddendOp) {
1296 AddendOp = II.getOperand(1);
1297 Mul = II.getOperand(2);
1298 } else {
1299 AddendOp = II.getOperand(2);
1300 Mul = II.getOperand(1);
1301 }
1302
1303 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1304 m_Value(MulOp1))))
1305 return std::nullopt;
1306
1307 if (!Mul->hasOneUse())
1308 return std::nullopt;
1309
1310 Instruction *FMFSource = nullptr;
1311 if (II.getType()->isFPOrFPVectorTy()) {
1312 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1313 // Stop the combine when the flags on the inputs differ in case dropping
1314 // flags would lead to us missing out on more beneficial optimizations.
1315 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1316 return std::nullopt;
1317 if (!FAddFlags.allowContract())
1318 return std::nullopt;
1319 FMFSource = &II;
1320 }
1321
1322 CallInst *Res;
1323 if (MergeIntoAddendOp)
1324 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1325 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1326 else
1327 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1328 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1329
1330 return IC.replaceInstUsesWith(II, Res);
1331}
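// Schematic example of the fusion above (illustrative, not from the upstream
// source): fadd(p, a, fmul(p, b, c)) with a single-use fmul and matching
// fast-math flags that allow contraction becomes fmla(p, a, b, c); with
// MergeIntoAddendOp == false the operand order instead selects the
// multiply-accumulate-into-multiplicand form (e.g. fmad).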
1332
1333static std::optional<Instruction *>
1334instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1335 Value *Pred = II.getOperand(0);
1336 Value *PtrOp = II.getOperand(1);
1337 Type *VecTy = II.getType();
1338
1339 if (isAllActivePredicate(Pred)) {
1340 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1341 Load->copyMetadata(II);
1342 return IC.replaceInstUsesWith(II, Load);
1343 }
1344
1345 CallInst *MaskedLoad =
1346 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1347 Pred, ConstantAggregateZero::get(VecTy));
1348 MaskedLoad->copyMetadata(II);
1349 return IC.replaceInstUsesWith(II, MaskedLoad);
1350}
1351
1352static std::optional<Instruction *>
1353instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1354 Value *VecOp = II.getOperand(0);
1355 Value *Pred = II.getOperand(1);
1356 Value *PtrOp = II.getOperand(2);
1357
1358 if (isAllActivePredicate(Pred)) {
1359 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1360 Store->copyMetadata(II);
1361 return IC.eraseInstFromFunction(II);
1362 }
1363
1364 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1365 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1366 MaskedStore->copyMetadata(II);
1367 return IC.eraseInstFromFunction(II);
1368}
1369
1370static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1371 switch (Intrinsic) {
1372 case Intrinsic::aarch64_sve_fmul_u:
1373 return Instruction::BinaryOps::FMul;
1374 case Intrinsic::aarch64_sve_fadd_u:
1375 return Instruction::BinaryOps::FAdd;
1376 case Intrinsic::aarch64_sve_fsub_u:
1377 return Instruction::BinaryOps::FSub;
1378 default:
1379 return Instruction::BinaryOpsEnd;
1380 }
1381}
1382
1383static std::optional<Instruction *>
1384instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1385 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1386 if (II.isStrictFP())
1387 return std::nullopt;
1388
1389 auto *OpPredicate = II.getOperand(0);
1390 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1391 if (BinOpCode == Instruction::BinaryOpsEnd ||
1392 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1393 m_ConstantInt<AArch64SVEPredPattern::all>())))
1394 return std::nullopt;
1395 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1396 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1397 auto BinOp =
1398 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1399 return IC.replaceInstUsesWith(II, BinOp);
1400}
1401
1402// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1403// sve.add_u).
1404static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1405 Intrinsic::ID IID) {
1406 auto *OpPredicate = II.getOperand(0);
1407 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1408 m_ConstantInt<AArch64SVEPredPattern::all>())))
1409 return std::nullopt;
1410
1411 auto *Mod = II.getModule();
1412 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1413 II.setCalledFunction(NewDecl);
1414
1415 return &II;
1416}
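// Illustrative example (not from the upstream source): when the governing
// predicate is ptrue(all), a call such as
//   @llvm.aarch64.sve.add.nxv4i32(%ptrue, %a, %b)
// is rewritten in place to the unpredicated-friendly form
//   @llvm.aarch64.sve.add.u.nxv4i32(%ptrue, %a, %b)
// which later lowering is free to treat as a plain vector add.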
1417
1418// Simplify operations where predicate has all inactive lanes or try to replace
1419// with _u form when all lanes are active
1420static std::optional<Instruction *>
1421instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1422 Intrinsic::ID IID) {
1423 if (match(II.getOperand(0), m_ZeroInt())) {
1424 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1425 // inactive for sv[func]_m
1426 return IC.replaceInstUsesWith(II, II.getOperand(1));
1427 }
1428 return instCombineSVEAllActive(II, IID);
1429}
1430
1431static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1432 IntrinsicInst &II) {
1433 if (auto II_U =
1434 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1435 return II_U;
1436 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1437 Intrinsic::aarch64_sve_mla>(
1438 IC, II, true))
1439 return MLA;
1440 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1441 Intrinsic::aarch64_sve_mad>(
1442 IC, II, false))
1443 return MAD;
1444 return std::nullopt;
1445}
1446
1447static std::optional<Instruction *>
1448instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1449 if (auto II_U =
1450 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1451 return II_U;
1452 if (auto FMLA =
1453 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1454 Intrinsic::aarch64_sve_fmla>(IC, II,
1455 true))
1456 return FMLA;
1457 if (auto FMAD =
1458 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1459 Intrinsic::aarch64_sve_fmad>(IC, II,
1460 false))
1461 return FMAD;
1462 if (auto FMLA =
1463 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1464 Intrinsic::aarch64_sve_fmla>(IC, II,
1465 true))
1466 return FMLA;
1467 return std::nullopt;
1468}
1469
1470static std::optional<Instruction *>
1471instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1472 if (auto FMLA =
1473 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1474 Intrinsic::aarch64_sve_fmla>(IC, II,
1475 true))
1476 return FMLA;
1477 if (auto FMAD =
1478 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1479 Intrinsic::aarch64_sve_fmad>(IC, II,
1480 false))
1481 return FMAD;
1482 if (auto FMLA_U =
1483 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1484 Intrinsic::aarch64_sve_fmla_u>(
1485 IC, II, true))
1486 return FMLA_U;
1487 return instCombineSVEVectorBinOp(IC, II);
1488}
1489
1490static std::optional<Instruction *>
1491instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1492 if (auto II_U =
1493 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1494 return II_U;
1495 if (auto FMLS =
1496 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1497 Intrinsic::aarch64_sve_fmls>(IC, II,
1498 true))
1499 return FMLS;
1500 if (auto FMSB =
1501 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1502 Intrinsic::aarch64_sve_fnmsb>(
1503 IC, II, false))
1504 return FMSB;
1505 if (auto FMLS =
1506 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1507 Intrinsic::aarch64_sve_fmls>(IC, II,
1508 true))
1509 return FMLS;
1510 return std::nullopt;
1511}
1512
1513static std::optional<Instruction *>
1514instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1515 if (auto FMLS =
1516 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1517 Intrinsic::aarch64_sve_fmls>(IC, II,
1518 true))
1519 return FMLS;
1520 if (auto FMSB =
1521 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1522 Intrinsic::aarch64_sve_fnmsb>(
1523 IC, II, false))
1524 return FMSB;
1525 if (auto FMLS_U =
1526 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1527 Intrinsic::aarch64_sve_fmls_u>(
1528 IC, II, true))
1529 return FMLS_U;
1530 return instCombineSVEVectorBinOp(IC, II);
1531}
1532
1533static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1534 IntrinsicInst &II) {
1535 if (auto II_U =
1536 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1537 return II_U;
1538 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1539 Intrinsic::aarch64_sve_mls>(
1540 IC, II, true))
1541 return MLS;
1542 return std::nullopt;
1543}
1544
1545static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1546 IntrinsicInst &II,
1547 Intrinsic::ID IID) {
1548 auto *OpPredicate = II.getOperand(0);
1549 auto *OpMultiplicand = II.getOperand(1);
1550 auto *OpMultiplier = II.getOperand(2);
1551
1552 // Return true if a given instruction is a unit splat value, false otherwise.
1553 auto IsUnitSplat = [](auto *I) {
1554 auto *SplatValue = getSplatValue(I);
1555 if (!SplatValue)
1556 return false;
1557 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1558 };
1559
1560 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1561 // with a unit splat value, false otherwise.
1562 auto IsUnitDup = [](auto *I) {
1563 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1564 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1565 return false;
1566
1567 auto *SplatValue = IntrI->getOperand(2);
1568 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1569 };
1570
1571 if (IsUnitSplat(OpMultiplier)) {
1572 // [f]mul pg %n, (dupx 1) => %n
1573 OpMultiplicand->takeName(&II);
1574 return IC.replaceInstUsesWith(II, OpMultiplicand);
1575 } else if (IsUnitDup(OpMultiplier)) {
1576 // [f]mul pg %n, (dup pg 1) => %n
1577 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1578 auto *DupPg = DupInst->getOperand(1);
1579 // TODO: this is naive. The optimization is still valid if DupPg
1580 // 'encompasses' OpPredicate, not only if they're the same predicate.
1581 if (OpPredicate == DupPg) {
1582 OpMultiplicand->takeName(&II);
1583 return IC.replaceInstUsesWith(II, OpMultiplicand);
1584 }
1585 }
1586
1587 return instCombineSVEVectorBinOp(IC, II);
1588}
1589
1590static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1591 IntrinsicInst &II) {
1592 Value *UnpackArg = II.getArgOperand(0);
1593 auto *RetTy = cast<ScalableVectorType>(II.getType());
1594 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1595 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1596
1597 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1598 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1599 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1600 ScalarArg =
1601 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1602 Value *NewVal =
1603 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1604 NewVal->takeName(&II);
1605 return IC.replaceInstUsesWith(II, NewVal);
1606 }
1607
1608 return std::nullopt;
1609}
1610static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1611 IntrinsicInst &II) {
1612 auto *OpVal = II.getOperand(0);
1613 auto *OpIndices = II.getOperand(1);
1614 VectorType *VTy = cast<VectorType>(II.getType());
1615
1616 // Check whether OpIndices is a constant splat value < minimal element count
1617 // of result.
1618 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1619 if (!SplatValue ||
1620 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1621 return std::nullopt;
1622
1623 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1624 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1625 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1626 auto *VectorSplat =
1627 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1628
1629 VectorSplat->takeName(&II);
1630 return IC.replaceInstUsesWith(II, VectorSplat);
1631}
1632
1633static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1634 IntrinsicInst &II) {
1635 Value *A, *B;
1636 Type *RetTy = II.getType();
1637 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1638 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1639
1640 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1641 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1642 if ((match(II.getArgOperand(0),
1643 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1644 match(II.getArgOperand(1),
1645 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1646 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1647 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1648 auto *TyA = cast<ScalableVectorType>(A->getType());
1649 if (TyA == B->getType() &&
1650 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1651 auto *SubVec = IC.Builder.CreateInsertVector(
1652 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1653 auto *ConcatVec = IC.Builder.CreateInsertVector(
1654 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1655 ConcatVec->takeName(&II);
1656 return IC.replaceInstUsesWith(II, ConcatVec);
1657 }
1658 }
1659
1660 return std::nullopt;
1661}
1662
1663static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1664 IntrinsicInst &II) {
1665 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1666 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1667 Value *A, *B;
1668 if (match(II.getArgOperand(0),
1669 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1670 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1671 m_Specific(A), m_Specific(B))))
1672 return IC.replaceInstUsesWith(
1673 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1674
1675 return std::nullopt;
1676}
1677
1678 static std::optional<Instruction *>
1679 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1680 Value *Mask = II.getOperand(0);
1681 Value *BasePtr = II.getOperand(1);
1682 Value *Index = II.getOperand(2);
1683 Type *Ty = II.getType();
1684 Value *PassThru = ConstantAggregateZero::get(Ty);
1685
1686 // Contiguous gather => masked load.
1687 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1688 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1689 Value *IndexBase;
1690 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1691 m_Value(IndexBase), m_SpecificInt(1)))) {
1692 Align Alignment =
1693 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1694
1695 Type *VecPtrTy = PointerType::getUnqual(Ty);
1696 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1697 BasePtr, IndexBase);
1698 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1699 CallInst *MaskedLoad =
1700 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1701 MaskedLoad->takeName(&II);
1702 return IC.replaceInstUsesWith(II, MaskedLoad);
1703 }
1704
1705 return std::nullopt;
1706}
1707
1708 static std::optional<Instruction *>
1709 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1710 Value *Val = II.getOperand(0);
1711 Value *Mask = II.getOperand(1);
1712 Value *BasePtr = II.getOperand(2);
1713 Value *Index = II.getOperand(3);
1714 Type *Ty = Val->getType();
1715
1716 // Contiguous scatter => masked store.
1717 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1718 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1719 Value *IndexBase;
1720 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1721 m_Value(IndexBase), m_SpecificInt(1)))) {
1722 Align Alignment =
1723 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1724
1725 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1726 BasePtr, IndexBase);
1727 Type *VecPtrTy = PointerType::getUnqual(Ty);
1728 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1729
1730 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1731
1732 return IC.eraseInstFromFunction(II);
1733 }
1734
1735 return std::nullopt;
1736}
1737
1738static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1739 IntrinsicInst &II) {
1740 Type *Int32Ty = IC.Builder.getInt32Ty();
1741 Value *Pred = II.getOperand(0);
1742 Value *Vec = II.getOperand(1);
1743 Value *DivVec = II.getOperand(2);
1744
1745 Value *SplatValue = getSplatValue(DivVec);
1746 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1747 if (!SplatConstantInt)
1748 return std::nullopt;
1749 APInt Divisor = SplatConstantInt->getValue();
1750
1751 if (Divisor.isPowerOf2()) {
1752 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1753 auto ASRD = IC.Builder.CreateIntrinsic(
1754 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1755 return IC.replaceInstUsesWith(II, ASRD);
1756 }
1757 if (Divisor.isNegatedPowerOf2()) {
1758 Divisor.negate();
1759 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1760 auto ASRD = IC.Builder.CreateIntrinsic(
1761 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1762 auto NEG = IC.Builder.CreateIntrinsic(
1763 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1764 return IC.replaceInstUsesWith(II, NEG);
1765 }
1766
1767 return std::nullopt;
1768}
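// Illustrative sketch (not part of the original source): for
//   sve.sdiv(%pg, %x, dup_x(16))
// the combine above emits sve.asrd(%pg, %x, #4), and for a divisor of -16 it
// emits the same asrd followed by a predicated neg; ASRD is a rounding
// arithmetic shift, so it matches the truncating behaviour of signed
// division by a power of two.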
1769
1770bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1771 size_t VecSize = Vec.size();
1772 if (VecSize == 1)
1773 return true;
1774 if (!isPowerOf2_64(VecSize))
1775 return false;
1776 size_t HalfVecSize = VecSize / 2;
1777
1778 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1779 RHS != Vec.end(); LHS++, RHS++) {
1780 if (*LHS != nullptr && *RHS != nullptr) {
1781 if (*LHS == *RHS)
1782 continue;
1783 else
1784 return false;
1785 }
1786 if (!AllowPoison)
1787 return false;
1788 if (*LHS == nullptr && *RHS != nullptr)
1789 *LHS = *RHS;
1790 }
1791
1792 Vec.resize(HalfVecSize);
1793 SimplifyValuePattern(Vec, AllowPoison);
1794 return true;
1795}
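// Worked example (illustrative, not from the original source): {A, B, A, B}
// compares pairwise equal across its two halves and is shrunk to {A, B}; the
// recursive call on {A, B} fails (A != B) but its result is ignored, so the
// two-element pattern is kept. With AllowPoison, {nullptr, B, A, B} is also
// accepted: the missing lane in the first half is taken from the second half
// before that half is dropped.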
1796
1797// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1798// to dupqlane(f64(C)) where C is A concatenated with B
1799static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1800 IntrinsicInst &II) {
1801 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1802 if (!match(II.getOperand(0),
1803 m_Intrinsic<Intrinsic::vector_insert>(
1804 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1805 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1806 return std::nullopt;
1807 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1808
1809 // Insert the scalars into a container ordered by InsertElement index
1810 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1811 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1812 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1813 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1814 CurrentInsertElt = InsertElt->getOperand(0);
1815 }
1816
1817 bool AllowPoison =
1818 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1819 if (!SimplifyValuePattern(Elts, AllowPoison))
1820 return std::nullopt;
1821
1822 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1823 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1824 for (size_t I = 0; I < Elts.size(); I++) {
1825 if (Elts[I] == nullptr)
1826 continue;
1827 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
1828 IC.Builder.getInt64(I));
1829 }
1830 if (InsertEltChain == nullptr)
1831 return std::nullopt;
1832
1833 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1834 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1835 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1836 // be narrowed back to the original type.
1837 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1838 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1839 IIScalableTy->getMinNumElements() /
1840 PatternWidth;
1841
1842 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1843 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1844 auto *WideShuffleMaskTy =
1845 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1846
1847 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
1848 auto InsertSubvector = IC.Builder.CreateInsertVector(
1849 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1850 auto WideBitcast =
1851 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1852 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1853 auto WideShuffle = IC.Builder.CreateShuffleVector(
1854 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1855 auto NarrowBitcast =
1856 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1857
1858 return IC.replaceInstUsesWith(II, NarrowBitcast);
1859}
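// Worked example (illustrative, not from the original source): for a
// <vscale x 8 x half> dupq_lane whose insert chain simplifies to (a, b),
// PatternWidth = 16 * 2 = 32 bits and PatternElementCount = (16 * 8) / 32 = 4,
// so the pair is packed into one i32 lane, splatted as <vscale x 4 x i32> via
// the zero shuffle mask, and bitcast back to <vscale x 8 x half>.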
1860
1861static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1862 IntrinsicInst &II) {
1863 Value *A = II.getArgOperand(0);
1864 Value *B = II.getArgOperand(1);
1865 if (A == B)
1866 return IC.replaceInstUsesWith(II, A);
1867
1868 return std::nullopt;
1869}
1870
1871static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1872 IntrinsicInst &II) {
1873 Value *Pred = II.getOperand(0);
1874 Value *Vec = II.getOperand(1);
1875 Value *Shift = II.getOperand(2);
1876
1877 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1878 Value *AbsPred, *MergedValue;
1879 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1880 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1881 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1882 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1883
1884 return std::nullopt;
1885
1886 // Transform is valid if any of the following are true:
1887 // * The ABS merge value is an undef or non-negative
1888 // * The ABS predicate is all active
1889 // * The ABS predicate and the SRSHL predicates are the same
1890 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1891 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1892 return std::nullopt;
1893
1894 // Only valid when the shift amount is non-negative, otherwise the rounding
1895 // behaviour of SRSHL cannot be ignored.
1896 if (!match(Shift, m_NonNegative()))
1897 return std::nullopt;
1898
1899 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1900 {II.getType()}, {Pred, Vec, Shift});
1901
1902 return IC.replaceInstUsesWith(II, LSL);
1903}
1904
1905 std::optional<Instruction *>
1906 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1907 IntrinsicInst &II) const {
1908 Intrinsic::ID IID = II.getIntrinsicID();
1909 switch (IID) {
1910 default:
1911 break;
1912 case Intrinsic::aarch64_neon_fmaxnm:
1913 case Intrinsic::aarch64_neon_fminnm:
1914 return instCombineMaxMinNM(IC, II);
1915 case Intrinsic::aarch64_sve_convert_from_svbool:
1916 return instCombineConvertFromSVBool(IC, II);
1917 case Intrinsic::aarch64_sve_dup:
1918 return instCombineSVEDup(IC, II);
1919 case Intrinsic::aarch64_sve_dup_x:
1920 return instCombineSVEDupX(IC, II);
1921 case Intrinsic::aarch64_sve_cmpne:
1922 case Intrinsic::aarch64_sve_cmpne_wide:
1923 return instCombineSVECmpNE(IC, II);
1924 case Intrinsic::aarch64_sve_rdffr:
1925 return instCombineRDFFR(IC, II);
1926 case Intrinsic::aarch64_sve_lasta:
1927 case Intrinsic::aarch64_sve_lastb:
1928 return instCombineSVELast(IC, II);
1929 case Intrinsic::aarch64_sve_clasta_n:
1930 case Intrinsic::aarch64_sve_clastb_n:
1931 return instCombineSVECondLast(IC, II);
1932 case Intrinsic::aarch64_sve_cntd:
1933 return instCombineSVECntElts(IC, II, 2);
1934 case Intrinsic::aarch64_sve_cntw:
1935 return instCombineSVECntElts(IC, II, 4);
1936 case Intrinsic::aarch64_sve_cnth:
1937 return instCombineSVECntElts(IC, II, 8);
1938 case Intrinsic::aarch64_sve_cntb:
1939 return instCombineSVECntElts(IC, II, 16);
1940 case Intrinsic::aarch64_sve_ptest_any:
1941 case Intrinsic::aarch64_sve_ptest_first:
1942 case Intrinsic::aarch64_sve_ptest_last:
1943 return instCombineSVEPTest(IC, II);
1944 case Intrinsic::aarch64_sve_fabd:
1945 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
1946 case Intrinsic::aarch64_sve_fadd:
1947 return instCombineSVEVectorFAdd(IC, II);
1948 case Intrinsic::aarch64_sve_fadd_u:
1949 return instCombineSVEVectorFAddU(IC, II);
1950 case Intrinsic::aarch64_sve_fdiv:
1951 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
1952 case Intrinsic::aarch64_sve_fmax:
1953 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
1954 case Intrinsic::aarch64_sve_fmaxnm:
1955 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
1956 case Intrinsic::aarch64_sve_fmin:
1957 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
1958 case Intrinsic::aarch64_sve_fminnm:
1959 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
1960 case Intrinsic::aarch64_sve_fmla:
1961 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
1962 case Intrinsic::aarch64_sve_fmls:
1963 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
1964 case Intrinsic::aarch64_sve_fmul:
1965 if (auto II_U =
1966 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
1967 return II_U;
1968 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
1969 case Intrinsic::aarch64_sve_fmul_u:
1970 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
1971 case Intrinsic::aarch64_sve_fmulx:
1972 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
1973 case Intrinsic::aarch64_sve_fnmla:
1974 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
1975 case Intrinsic::aarch64_sve_fnmls:
1976 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
1977 case Intrinsic::aarch64_sve_fsub:
1978 return instCombineSVEVectorFSub(IC, II);
1979 case Intrinsic::aarch64_sve_fsub_u:
1980 return instCombineSVEVectorFSubU(IC, II);
1981 case Intrinsic::aarch64_sve_add:
1982 return instCombineSVEVectorAdd(IC, II);
1983 case Intrinsic::aarch64_sve_add_u:
1984 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
1985 Intrinsic::aarch64_sve_mla_u>(
1986 IC, II, true);
1987 case Intrinsic::aarch64_sve_mla:
1988 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
1989 case Intrinsic::aarch64_sve_mls:
1990 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
1991 case Intrinsic::aarch64_sve_mul:
1992 if (auto II_U =
1993 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
1994 return II_U;
1995 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
1996 case Intrinsic::aarch64_sve_mul_u:
1997 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
1998 case Intrinsic::aarch64_sve_sabd:
1999 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2000 case Intrinsic::aarch64_sve_smax:
2001 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2002 case Intrinsic::aarch64_sve_smin:
2003 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2004 case Intrinsic::aarch64_sve_smulh:
2005 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2006 case Intrinsic::aarch64_sve_sub:
2007 return instCombineSVEVectorSub(IC, II);
2008 case Intrinsic::aarch64_sve_sub_u:
2009 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2010 Intrinsic::aarch64_sve_mls_u>(
2011 IC, II, true);
2012 case Intrinsic::aarch64_sve_uabd:
2013 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2014 case Intrinsic::aarch64_sve_umax:
2015 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2016 case Intrinsic::aarch64_sve_umin:
2017 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2018 case Intrinsic::aarch64_sve_umulh:
2019 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2020 case Intrinsic::aarch64_sve_asr:
2021 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2022 case Intrinsic::aarch64_sve_lsl:
2023 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2024 case Intrinsic::aarch64_sve_lsr:
2025 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2026 case Intrinsic::aarch64_sve_and:
2027 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2028 case Intrinsic::aarch64_sve_bic:
2029 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2030 case Intrinsic::aarch64_sve_eor:
2031 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2032 case Intrinsic::aarch64_sve_orr:
2033 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2034 case Intrinsic::aarch64_sve_sqsub:
2035 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2036 case Intrinsic::aarch64_sve_uqsub:
2037 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2038 case Intrinsic::aarch64_sve_tbl:
2039 return instCombineSVETBL(IC, II);
2040 case Intrinsic::aarch64_sve_uunpkhi:
2041 case Intrinsic::aarch64_sve_uunpklo:
2042 case Intrinsic::aarch64_sve_sunpkhi:
2043 case Intrinsic::aarch64_sve_sunpklo:
2044 return instCombineSVEUnpack(IC, II);
2045 case Intrinsic::aarch64_sve_uzp1:
2046 return instCombineSVEUzp1(IC, II);
2047 case Intrinsic::aarch64_sve_zip1:
2048 case Intrinsic::aarch64_sve_zip2:
2049 return instCombineSVEZip(IC, II);
2050 case Intrinsic::aarch64_sve_ld1_gather_index:
2051 return instCombineLD1GatherIndex(IC, II);
2052 case Intrinsic::aarch64_sve_st1_scatter_index:
2053 return instCombineST1ScatterIndex(IC, II);
2054 case Intrinsic::aarch64_sve_ld1:
2055 return instCombineSVELD1(IC, II, DL);
2056 case Intrinsic::aarch64_sve_st1:
2057 return instCombineSVEST1(IC, II, DL);
2058 case Intrinsic::aarch64_sve_sdiv:
2059 return instCombineSVESDIV(IC, II);
2060 case Intrinsic::aarch64_sve_sel:
2061 return instCombineSVESel(IC, II);
2062 case Intrinsic::aarch64_sve_srshl:
2063 return instCombineSVESrshl(IC, II);
2064 case Intrinsic::aarch64_sve_dupq_lane:
2065 return instCombineSVEDupqLane(IC, II);
2066 }
2067
2068 return std::nullopt;
2069}
2070
2071 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2072 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2073 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2074 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2075 SimplifyAndSetOp) const {
2076 switch (II.getIntrinsicID()) {
2077 default:
2078 break;
2079 case Intrinsic::aarch64_neon_fcvtxn:
2080 case Intrinsic::aarch64_neon_rshrn:
2081 case Intrinsic::aarch64_neon_sqrshrn:
2082 case Intrinsic::aarch64_neon_sqrshrun:
2083 case Intrinsic::aarch64_neon_sqshrn:
2084 case Intrinsic::aarch64_neon_sqshrun:
2085 case Intrinsic::aarch64_neon_sqxtn:
2086 case Intrinsic::aarch64_neon_sqxtun:
2087 case Intrinsic::aarch64_neon_uqrshrn:
2088 case Intrinsic::aarch64_neon_uqshrn:
2089 case Intrinsic::aarch64_neon_uqxtn:
2090 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2091 break;
2092 }
2093
2094 return std::nullopt;
2095}
2096
2097 TypeSize
2098 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2099 switch (K) {
2100 case TargetTransformInfo::RGK_Scalar:
2101 return TypeSize::getFixed(64);
2102 case TargetTransformInfo::RGK_FixedWidthVector:
2103 if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
2104 return TypeSize::getFixed(0);
2105
2106 if (ST->hasSVE())
2107 return TypeSize::getFixed(
2108 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2109
2110 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
2111 case TargetTransformInfo::RGK_ScalableVector:
2112 if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
2113 return TypeSize::getScalable(0);
2114
2115 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
2116 }
2117 llvm_unreachable("Unsupported register kind");
2118}
2119
2120 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2121 ArrayRef<const Value *> Args,
2122 Type *SrcOverrideTy) {
2123 // A helper that returns a vector type whose element type is taken from the
2124 // given type and whose element count matches that of DstTy.
2125 auto toVectorTy = [&](Type *ArgTy) {
2126 return VectorType::get(ArgTy->getScalarType(),
2127 cast<VectorType>(DstTy)->getElementCount());
2128 };
2129
2130 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2131 // i32, i64]. SVE doesn't generally have the same set of instructions to
2132 // perform an extend with the add/sub/mul. There are SMULLB style
2133 // instructions, but they operate on top/bottom, requiring some sort of lane
2134 // interleaving to be used with zext/sext.
2135 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2136 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2137 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2138 return false;
2139
2140 // Determine if the operation has a widening variant. We consider both the
2141 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2142 // instructions.
2143 //
2144 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2145 // verify that their extending operands are eliminated during code
2146 // generation.
2147 Type *SrcTy = SrcOverrideTy;
2148 switch (Opcode) {
2149 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2150 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2151 // The second operand needs to be an extend
2152 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2153 if (!SrcTy)
2154 SrcTy =
2155 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2156 } else
2157 return false;
2158 break;
2159 case Instruction::Mul: { // SMULL(2), UMULL(2)
2160 // Both operands need to be extends of the same type.
2161 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2162 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2163 if (!SrcTy)
2164 SrcTy =
2165 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2166 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2167 // If one of the operands is a Zext and the other has enough zero bits to
2168 // be treated as unsigned, we can still generate a umull, meaning the zext
2169 // is free.
2170 KnownBits Known =
2171 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2172 if (Args[0]->getType()->getScalarSizeInBits() -
2173 Known.Zero.countLeadingOnes() >
2174 DstTy->getScalarSizeInBits() / 2)
2175 return false;
2176 if (!SrcTy)
2177 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2178 DstTy->getScalarSizeInBits() / 2));
2179 } else
2180 return false;
2181 break;
2182 }
2183 default:
2184 return false;
2185 }
2186
2187 // Legalize the destination type and ensure it can be used in a widening
2188 // operation.
2189 auto DstTyL = getTypeLegalizationCost(DstTy);
2190 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2191 return false;
2192
2193 // Legalize the source type and ensure it can be used in a widening
2194 // operation.
2195 assert(SrcTy && "Expected some SrcTy");
2196 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2197 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2198 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2199 return false;
2200
2201 // Get the total number of vector elements in the legalized types.
2202 InstructionCost NumDstEls =
2203 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2204 InstructionCost NumSrcEls =
2205 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2206
2207 // Return true if the legalized types have the same number of vector elements
2208 // and the destination element type size is twice that of the source type.
2209 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2210}
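// Illustrative example (not part of the original source): for
//   %a = zext <8 x i8> %x to <8 x i16>
//   %b = zext <8 x i8> %y to <8 x i16>
//   %s = add <8 x i16> %a, %b
// both legalized types hold eight elements and the 16-bit destination element
// is twice the 8-bit source element, so this returns true; the add can be
// selected as UADDL and the extends come for free.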
2211
2212// s/urhadd instructions implement the following pattern, making the
2213// extends free:
2214// %x = add ((zext i8 -> i16), 1)
2215// %y = (zext i8 -> i16)
2216// trunc i16 (lshr (add %x, %y), 1) -> i8
2217//
2219 Type *Src) {
2220 // The source should be a legal vector type.
2221 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2222 (Src->isScalableTy() && !ST->hasSVE2()))
2223 return false;
2224
2225 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2226 return false;
2227
2228 // Look for trunc/shl/add before trying to match the pattern.
2229 const Instruction *Add = ExtUser;
2230 auto *AddUser =
2231 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2232 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2233 Add = AddUser;
2234
2235 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2236 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2237 return false;
2238
2239 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2240 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2241 Src->getScalarSizeInBits() !=
2242 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2243 return false;
2244
2245 // Try to match the whole pattern. Ext could be either the first or second
2246 // m_ZExtOrSExt matched.
2247 Instruction *Ex1, *Ex2;
2248 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2249 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2250 return false;
2251
2252 // Ensure both extends are of the same type
2253 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2254 Ex1->getOpcode() == Ex2->getOpcode())
2255 return true;
2256
2257 return false;
2258}
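// Worked example (illustrative, not from the original source): for i8 inputs
// x = 250 and y = 100 the pattern computes
//   trunc i16 (((zext x) + 1 + (zext y)) >> 1) = trunc (351 >> 1) = 175,
// which is exactly what URHADD produces in a single instruction, so the two
// zexts are modelled as free.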
2259
2260 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2261 Type *Src,
2262 TTI::CastContextHint CCH,
2263 TTI::TargetCostKind CostKind,
2264 const Instruction *I) {
2265 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2266 assert(ISD && "Invalid opcode");
2267 // If the cast is observable, and it is used by a widening instruction (e.g.,
2268 // uaddl, saddw, etc.), it may be free.
2269 if (I && I->hasOneUser()) {
2270 auto *SingleUser = cast<Instruction>(*I->user_begin());
2271 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2272 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2273 // For adds only count the second operand as free if both operands are
2274 // extends but not the same operation. (i.e both operands are not free in
2275 // add(sext, zext)).
2276 if (SingleUser->getOpcode() == Instruction::Add) {
2277 if (I == SingleUser->getOperand(1) ||
2278 (isa<CastInst>(SingleUser->getOperand(1)) &&
2279 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2280 return 0;
2281 } else // Others are free so long as isWideningInstruction returned true.
2282 return 0;
2283 }
2284
2285 // The cast will be free for the s/urhadd instructions
2286 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2287 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2288 return 0;
2289 }
2290
2291 // TODO: Allow non-throughput costs that aren't binary.
2292 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2293 if (CostKind != TTI::TCK_RecipThroughput)
2294 return Cost == 0 ? 0 : 1;
2295 return Cost;
2296 };
2297
2298 EVT SrcTy = TLI->getValueType(DL, Src);
2299 EVT DstTy = TLI->getValueType(DL, Dst);
2300
2301 if (!SrcTy.isSimple() || !DstTy.isSimple())
2302 return AdjustCost(
2303 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2304
2305 static const TypeConversionCostTblEntry
2306 ConversionTbl[] = {
2307 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2308 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2309 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2310 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2311 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2312 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2313 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2314 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2315 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2316 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2317 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2318 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2319 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2320 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2321 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2322 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2323 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2324 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2325 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2326 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2327
2328 // Truncations on nxvmiN
2329 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2330 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2331 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2332 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2333 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2334 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2335 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2336 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2337 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2338 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2339 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2340 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2341 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2342 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2343 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2344 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2345
2346 // The number of shll instructions for the extension.
2347 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2348 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2349 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2350 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2351 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2352 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2353 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2354 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2355 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2356 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2357 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2358 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2359 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2360 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2361 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2362 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2363
2364 // LowerVectorINT_TO_FP:
2365 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2366 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2367 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2368 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2369 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2370 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2371
2372 // Complex: to v2f32
2373 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2374 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2375 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2376 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2377 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2378 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2379
2380 // Complex: to v4f32
2381 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2382 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2383 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2384 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2385
2386 // Complex: to v8f32
2387 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2388 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2389 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2390 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2391
2392 // Complex: to v16f32
2393 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2394 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2395
2396 // Complex: to v2f64
2397 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2398 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2399 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2400 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2401 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2402 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2403
2404 // Complex: to v4f64
2405 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2406 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2407
2408 // LowerVectorFP_TO_INT
2409 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2410 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2411 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2412 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2413 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2414 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2415
2416 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2417 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2418 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2419 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2420 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2421 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2422 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2423
2424 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2425 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2426 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2427 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2428 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2429
2430 // Complex, from nxv2f32.
2431 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2432 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2433 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2434 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2435 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2436 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2437 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2438 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2439
2440 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2441 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2442 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2443 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2444 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2445 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2446 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2447
2448 // Complex, from nxv2f64.
2449 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2450 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2451 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2452 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2453 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2454 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2455 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2456 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2457
2458 // Complex, from nxv4f32.
2459 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2460 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2461 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2462 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2463 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2464 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2465 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2466 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2467
2468 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2469 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2470 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2471 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2472 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2473
2474 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2475 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2476 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2477 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2478 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2479 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2480 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2481
2482 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2483 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2484 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2485 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2486 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2487
2488 // Complex, from nxv8f16.
2489 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2490 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2491 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2492 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2493 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2494 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2495 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2496 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2497
2498 // Complex, from nxv4f16.
2499 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2500 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2501 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2502 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2503 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2504 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2505 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2506 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2507
2508 // Complex, from nxv2f16.
2509 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2510 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2511 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2512 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2513 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2514 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2515 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2516 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2517
2518 // Truncate from nxvmf32 to nxvmf16.
2519 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2520 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2521 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2522
2523 // Truncate from nxvmf64 to nxvmf16.
2524 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2525 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2526 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2527
2528 // Truncate from nxvmf64 to nxvmf32.
2529 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2530 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2531 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2532
2533 // Extend from nxvmf16 to nxvmf32.
2534 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2535 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2536 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2537
2538 // Extend from nxvmf16 to nxvmf64.
2539 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2540 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2541 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2542
2543 // Extend from nxvmf32 to nxvmf64.
2544 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2545 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2546 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2547
2548 // Bitcasts from float to integer
2549 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2550 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2551 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2552
2553 // Bitcasts from integer to float
2554 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2555 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2556 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2557
2558 // Add cost for extending to illegal -too wide- scalable vectors.
2559 // zero/sign extend are implemented by multiple unpack operations,
2560 // where each operation has a cost of 1.
2561 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2562 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2563 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2564 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2565 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2566 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2567
2568 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2569 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2570 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2571 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2572 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2573 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2574 };
2575
2576 // Estimate the cost of a fixed-length operation carried out on SVE
2577 // registers by the number of SVE registers required to represent the
2578 // fixed-width type.
2579 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2580 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2581 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2582 ST->useSVEForFixedLengthVectors(WiderTy)) {
2583 std::pair<InstructionCost, MVT> LT =
2584 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2585 unsigned NumElements = AArch64::SVEBitsPerBlock /
2586 LT.second.getVectorElementType().getSizeInBits();
2587 return AdjustCost(
2588 LT.first *
2589 getCastInstrCost(
2590 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2591 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2592 CostKind, I));
2593 }
2594
2595 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2596 DstTy.getSimpleVT(),
2597 SrcTy.getSimpleVT()))
2598 return AdjustCost(Entry->Cost);
2599
2600 static const TypeConversionCostTblEntry FP16Tbl[] = {
2601 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2602 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2603 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2604 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2605 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2606 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2607 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2608 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2609 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2610 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2611 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2612 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2613 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2614 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2615 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2616 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2617 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2618 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2619 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2620 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2621 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2622 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2623 };
2624
2625 if (ST->hasFullFP16())
2626 if (const auto *Entry = ConvertCostTableLookup(
2627 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2628 return AdjustCost(Entry->Cost);
2629
2630 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2631 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2632 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2633 TargetLowering::TypePromoteInteger &&
2634 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2635 TargetLowering::TypeSplitVector) {
2636 // The standard behaviour in the backend for these cases is to split the
2637 // extend up into two parts:
2638 // 1. Perform an extending load or masked load up to the legal type.
2639 // 2. Extend the loaded data to the final type.
2640 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2641 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2642 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2643 Opcode, LegalTy, Src, CCH, CostKind, I);
2644 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2645 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2646 return Part1 + Part2;
2647 }
2648
2649 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2650 // but we also want to include the TTI::CastContextHint::Masked case too.
2651 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2652 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2653 TLI->isTypeLegal(DstTy))
2654 CCH = TTI::CastContextHint::Normal;
2655
2656 return AdjustCost(
2657 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2658}
2659
2660 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2661 Type *Dst,
2662 VectorType *VecTy,
2663 unsigned Index) {
2664
2665 // Make sure we were given a valid extend opcode.
2666 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2667 "Invalid opcode");
2668
2669 // We are extending an element we extract from a vector, so the source type
2670 // of the extend is the element type of the vector.
2671 auto *Src = VecTy->getElementType();
2672
2673 // Sign- and zero-extends are for integer types only.
2674 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2675
2676 // Get the cost for the extract. We compute the cost (if any) for the extend
2677 // below.
2678 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2679 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2680 CostKind, Index, nullptr, nullptr);
2681
2682 // Legalize the types.
2683 auto VecLT = getTypeLegalizationCost(VecTy);
2684 auto DstVT = TLI->getValueType(DL, Dst);
2685 auto SrcVT = TLI->getValueType(DL, Src);
2686
2687 // If the resulting type is still a vector and the destination type is legal,
2688 // we may get the extension for free. If not, get the default cost for the
2689 // extend.
2690 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2691 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2692 CostKind);
2693
2694 // The destination type should be larger than the element type. If not, get
2695 // the default cost for the extend.
2696 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2697 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2698 CostKind);
2699
2700 switch (Opcode) {
2701 default:
2702 llvm_unreachable("Opcode should be either SExt or ZExt");
2703
2704 // For sign-extends, we only need a smov, which performs the extension
2705 // automatically.
2706 case Instruction::SExt:
2707 return Cost;
2708
2709 // For zero-extends, the extend is performed automatically by a umov unless
2710 // the destination type is i64 and the element type is i8 or i16.
2711 case Instruction::ZExt:
2712 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2713 return Cost;
2714 }
2715
2716 // If we are unable to perform the extend for free, get the default cost.
2717 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2718 CostKind);
2719}
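// Example (illustrative, not from the original source): extracting an i16
// lane of a v8i16 and sign-extending it to i32 is costed as the extract
// alone, since SMOV sign-extends as part of the move; zero-extending the same
// lane to i64 still pays for the extend, because UMOV only covers the cases
// where the destination is not i64 or the element is already 32 bits wide.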
2720
2721 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2722 TTI::TargetCostKind CostKind,
2723 const Instruction *I) {
2724 if (CostKind != TTI::TCK_RecipThroughput)
2725 return Opcode == Instruction::PHI ? 0 : 1;
2726 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2727 // Branches are assumed to be predicted.
2728 return 0;
2729}
2730
2731InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2732 Type *Val,
2733 unsigned Index,
2734 bool HasRealUse) {
2735 assert(Val->isVectorTy() && "This must be a vector type");
2736
2737 if (Index != -1U) {
2738 // Legalize the type.
2739 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2740
2741 // This type is legalized to a scalar type.
2742 if (!LT.second.isVector())
2743 return 0;
2744
2745 // The type may be split. For fixed-width vectors we can normalize the
2746 // index to the new type.
2747 if (LT.second.isFixedLengthVector()) {
2748 unsigned Width = LT.second.getVectorNumElements();
2749 Index = Index % Width;
2750 }
2751
2752 // The element at index zero is already inside the vector.
2753 // - For a physical (HasRealUse==true) insert-element or extract-element
2754 // instruction that extracts integers, an explicit FPR -> GPR move is
2755 // needed. So it has non-zero cost.
2756 // - For the rest of cases (virtual instruction or element type is float),
2757 // consider the instruction free.
2758 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2759 return 0;
2760
2761 // This recognises the "LD1 (single structure) to one lane of one register"
2762 // pattern. I.e., if this is an `insertelement` instruction whose second
2763 // operand is a load, then we will generate a LD1, which is an expensive
2764 // instruction.
2765 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2766 return ST->getVectorInsertExtractBaseCost() + 1;
2767
2768 // i1 inserts and extracts will include an extra cset or cmp of the vector
2769 // value. Increase the cost by 1 to account for this.
2770 if (Val->getScalarSizeInBits() == 1)
2771 return ST->getVectorInsertExtractBaseCost() + 1;
2772
2773 // FIXME:
2774 // If the extract-element and insert-element instructions could be
2775 // simplified away (e.g., could be combined into users by looking at use-def
2776 // context), they have no cost. This is not done in the first place for
2777 // compile-time considerations.
2778 }
2779
2780 // All other insert/extracts cost this much.
2781 return ST->getVectorInsertExtractBaseCost();
2782}
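// Example (illustrative, not part of the original source): extracting lane 5
// of a <8 x i32>, which legalizes to two v4i32 registers, is costed as lane
// 5 % 4 = 1 of the split type and charged the base insert/extract cost; a
// lane-0 access is only free when it is not a real instruction on integer
// elements, which would need the FPR -> GPR move described above.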
2783
2784 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2785 TTI::TargetCostKind CostKind,
2786 unsigned Index, Value *Op0,
2787 Value *Op1) {
2788 bool HasRealUse =
2789 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2790 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2791}
2792
2793 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2794 Type *Val,
2795 TTI::TargetCostKind CostKind,
2796 unsigned Index) {
2797 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2798}
2799
2800 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
2801 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2802 TTI::TargetCostKind CostKind) {
2803 if (isa<ScalableVectorType>(Ty))
2804 return InstructionCost::getInvalid();
2805 if (Ty->getElementType()->isFloatingPointTy())
2806 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
2807 CostKind);
2808 return DemandedElts.popcount() * (Insert + Extract) *
2809 ST->getVectorInsertExtractBaseCost();
2810}
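// Example (illustrative, not part of the original source): scalarizing four
// demanded integer lanes for insertion only (Insert == true, Extract == false)
// costs 4 * 1 * ST->getVectorInsertExtractBaseCost(); with a hypothetical base
// cost of 2 that is 8.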
2811
2812 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2813 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2814 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2815 ArrayRef<const Value *> Args,
2816 const Instruction *CxtI) {
2817
2818 // TODO: Handle more cost kinds.
2819 if (CostKind != TTI::TCK_RecipThroughput)
2820 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2821 Op2Info, Args, CxtI);
2822
2823 // Legalize the type.
2824 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2825 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2826
2827 switch (ISD) {
2828 default:
2829 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2830 Op2Info);
2831 case ISD::SDIV:
2832 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2833 // On AArch64, scalar signed division by a power-of-two constant is
2834 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2835 // The OperandValue properties may not be the same as those of the
2836 // previous operation; conservatively assume OP_None.
2837 InstructionCost Cost = getArithmeticInstrCost(
2838 Instruction::Add, Ty, CostKind,
2839 Op1Info.getNoProps(), Op2Info.getNoProps());
2840 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2841 Op1Info.getNoProps(), Op2Info.getNoProps());
2842 Cost += getArithmeticInstrCost(
2843 Instruction::Select, Ty, CostKind,
2844 Op1Info.getNoProps(), Op2Info.getNoProps());
2845 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2846 Op1Info.getNoProps(), Op2Info.getNoProps());
2847 return Cost;
2848 }
2849 [[fallthrough]];
2850 case ISD::UDIV: {
2851 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2852 auto VT = TLI->getValueType(DL, Ty);
2853 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2854 // Vector signed division by a constant is expanded to the
2855 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2856 // to MULHU + SUB + SRL + ADD + SRL.
2857 InstructionCost MulCost = getArithmeticInstrCost(
2858 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2859 InstructionCost AddCost = getArithmeticInstrCost(
2860 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2861 InstructionCost ShrCost = getArithmeticInstrCost(
2862 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2863 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2864 }
2865 }
2866
2867 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2868 Opcode, Ty, CostKind, Op1Info, Op2Info);
2869 if (Ty->isVectorTy()) {
2870 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2871 // When SDIV/UDIV operations are lowered using SVE, the cost is
2872 // lower.
2873 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2874 ->getPrimitiveSizeInBits()
2875 .getFixedValue() < 128) {
2876 EVT VT = TLI->getValueType(DL, Ty);
2877 static const CostTblEntry DivTbl[]{
2878 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2879 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2880 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2881 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2882 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2883 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2884
2885 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2886 if (nullptr != Entry)
2887 return Entry->Cost;
2888 }
2889 // For 8/16-bit elements, the cost is higher because the type
2890 // requires promotion and possibly splitting:
2891 if (LT.second.getScalarType() == MVT::i8)
2892 Cost *= 8;
2893 else if (LT.second.getScalarType() == MVT::i16)
2894 Cost *= 4;
2895 return Cost;
2896 } else {
2897 // If one of the operands is a uniform constant then the cost for each
2898 // element is the sum of the costs for insertion, extraction and division.
2899 // Insertion cost = 2, extraction cost = 2, division = cost of the
2900 // operation on the scalar type.
2901 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2902 (Op2Info.isConstant() && Op2Info.isUniform())) {
2903 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2904 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2905 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2906 return (4 + DivCost) * VTy->getNumElements();
2907 }
2908 }
2909 // On AArch64, without SVE, vector divisions are expanded
2910 // into scalar divisions of each pair of elements.
2911 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2912 CostKind, Op1Info, Op2Info);
2913 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2914 Op1Info, Op2Info);
2915 }
2916
2917 // TODO: if one of the arguments is scalar, then it's not necessary to
2918 // double the cost of handling the vector elements.
2919 Cost += Cost;
2920 }
2921 return Cost;
2922 }
2923 case ISD::MUL:
2924 // When SVE is available, we can lower the v2i64 operation using
2925 // the SVE mul instruction, which has a lower cost.
2926 if (LT.second == MVT::v2i64 && ST->hasSVE())
2927 return LT.first;
2928
2929 // When SVE is not available, there is no MUL.2d instruction,
2930 // which means mul <2 x i64> is expensive as elements are extracted
2931 // from the vectors and the muls scalarized.
2932 // As getScalarizationOverhead is a bit too pessimistic, we
2933 // estimate the cost for an i64 vector directly here, which is:
2934 // - four 2-cost i64 extracts,
2935 // - two 2-cost i64 inserts, and
2936 // - two 1-cost muls.
2937 // So, for a v2i64 with LT.first = 1 the cost is 4*2 + 2*2 + 2*1 = 14, and
2938 // for a v4i64 with LT.first = 2 the cost is 28. If both operands are
2939 // extensions it will not need to scalarize, so the cost can be cheaper
2940 // (smull or umull).
2941 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2942 return LT.first;
2943 return LT.first * 14;
2944 case ISD::ADD:
2945 case ISD::XOR:
2946 case ISD::OR:
2947 case ISD::AND:
2948 case ISD::SRL:
2949 case ISD::SRA:
2950 case ISD::SHL:
2951 // These nodes are marked as 'custom' for combining purposes only.
2952 // We know that they are legal. See LowerAdd in ISelLowering.
2953 return LT.first;
2954
2955 case ISD::FNEG:
2956 case ISD::FADD:
2957 case ISD::FSUB:
2958 // Increase the cost for half and bfloat types if not architecturally
2959 // supported.
2960 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
2961 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
2962 return 2 * LT.first;
2963 if (!Ty->getScalarType()->isFP128Ty())
2964 return LT.first;
2965 [[fallthrough]];
2966 case ISD::FMUL:
2967 case ISD::FDIV:
2968 // These nodes are marked as 'custom' just to lower them to SVE.
2969 // We know said lowering will incur no additional cost.
2970 if (!Ty->getScalarType()->isFP128Ty())
2971 return 2 * LT.first;
2972
2973 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2974 Op2Info);
2975 }
2976}
2977
2978 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2979 ScalarEvolution *SE,
2980 const SCEV *Ptr) {
2981 // Address computations in vectorized code with non-consecutive addresses will
2982 // likely result in more instructions compared to scalar code where the
2983 // computation can more often be merged into the index mode. The resulting
2984 // extra micro-ops can significantly decrease throughput.
2985 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
2986 int MaxMergeDistance = 64;
2987
2988 if (Ty->isVectorTy() && SE &&
2989 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2990 return NumVectorInstToHideOverhead;
2991
2992 // In many cases the address computation is not merged into the instruction
2993 // addressing mode.
2994 return 1;
2995}
2996
2997 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2998 Type *CondTy,
2999 CmpInst::Predicate VecPred,
3000 TTI::TargetCostKind CostKind,
3001 const Instruction *I) {
3002 // TODO: Handle other cost kinds.
3003 if (CostKind != TTI::TCK_RecipThroughput)
3004 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3005 I);
3006
3007 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3008 // We don't lower some vector selects well when they are wider than the
3009 // register width.
3010 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3011 // We would need this many instructions to hide the scalarization happening.
3012 const int AmortizationCost = 20;
3013
3014 // If VecPred is not set, check if we can get a predicate from the context
3015 // instruction, if its type matches the requested ValTy.
3016 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3017 CmpInst::Predicate CurrentPred;
3018 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3019 m_Value())))
3020 VecPred = CurrentPred;
3021 }
3022 // Check if we have a compare/select chain that can be lowered using
3023 // a (F)CMxx & BFI pair.
3024 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3025 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3026 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3027 VecPred == CmpInst::FCMP_UNE) {
3028 static const auto ValidMinMaxTys = {
3029 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3030 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3031 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3032
3033 auto LT = getTypeLegalizationCost(ValTy);
3034 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3035 (ST->hasFullFP16() &&
3036 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3037 return LT.first;
3038 }
3039
3040 static const TypeConversionCostTblEntry
3041 VectorSelectTbl[] = {
3042 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3043 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3044 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3045 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3046 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3047 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3048 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3049 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3050 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3051 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3052 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3053 };
3054
3055 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3056 EVT SelValTy = TLI->getValueType(DL, ValTy);
3057 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3058 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3059 SelCondTy.getSimpleVT(),
3060 SelValTy.getSimpleVT()))
3061 return Entry->Cost;
3062 }
3063 }
3064
3065 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3066 auto LT = getTypeLegalizationCost(ValTy);
3067 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3068 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3069 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3070 }
3071
3072 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3073 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3074 // be profitable.
3075 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3076 ICmpInst::isEquality(VecPred) &&
3077 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3078 match(I->getOperand(1), m_Zero()) &&
3079 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3080 return 0;
3081
3082 // The base case handles scalable vectors fine for now, since it treats the
3083 // cost as 1 * legalization cost.
3084 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3085}
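
// Standalone sketch of the fixed-width select costs in VectorSelectTbl above:
// selects whose value type fits in one 128-bit register are charged a small
// constant, wider integer selects are charged roughly one instruction per
// lane, and 64-bit-element selects additionally pay the AmortizationCost of
// 20. The thresholds are assumptions distilled from the table for
// illustration, not the cost-table machinery itself.
static unsigned vectorSelectCostSketch(unsigned NumLanes, unsigned EltBits) {
  const unsigned AmortizationCost = 20;
  if (NumLanes * EltBits <= 128)
    return 2;                           // e.g. v2i1/v2f32 or v8i1/v8f16
  if (EltBits == 64)
    return NumLanes * AmortizationCost; // e.g. v4i1/v4i64 -> 80
  return NumLanes;                      // e.g. v16i1/v16i16 -> 16
}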
3086
3087 TTI::MemCmpExpansionOptions
3088 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3089 TTI::MemCmpExpansionOptions Options;
3090 if (ST->requiresStrictAlign()) {
3091 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3092 // a bunch of instructions when strict align is enabled.
3093 return Options;
3094 }
3095 Options.AllowOverlappingLoads = true;
3096 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3097 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3098 // TODO: Though vector loads usually perform well on AArch64, on some targets
3099 // they may wake up the FP unit, which raises the power consumption. Perhaps
3100 // they could be used with no holds barred (-O3).
3101 Options.LoadSizes = {8, 4, 2, 1};
3102 Options.AllowedTailExpansions = {3, 5, 6};
3103 return Options;
3104}
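
// Sketch of how the options above shape a memcmp expansion: with load sizes
// {8, 4, 2, 1}, a compare of N bytes decomposes greedily into wide loads.
// This only counts loads for illustration (the real expansion is done by the
// generic ExpandMemCmp pass and can also use overlapping loads for odd sizes).
static unsigned memcmpLoadCountSketch(unsigned NumBytes) {
  const unsigned LoadSizes[] = {8, 4, 2, 1}; // mirrors Options.LoadSizes
  unsigned Loads = 0;
  for (unsigned Size : LoadSizes) {
    Loads += NumBytes / Size;
    NumBytes %= Size;
  }
  return Loads; // e.g. 16 bytes -> 2 loads per operand, 7 bytes -> 3
}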
3105
3106 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3107 return ST->hasSVE();
3108}
3109
3110 InstructionCost
3111 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3112 Align Alignment, unsigned AddressSpace,
3113 TTI::TargetCostKind CostKind) {
3114 if (useNeonVector(Src))
3115 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3116 CostKind);
3117 auto LT = getTypeLegalizationCost(Src);
3118 if (!LT.first.isValid())
3120
3121 // The code-generator is currently not able to handle scalable vectors
3122 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3123 // it. This change will be removed when code-generation for these types is
3124 // sufficiently reliable.
3125 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
3127
3128 return LT.first;
3129}
3130
3131static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3132 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3133}
3134
3135 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3136 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3137 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3138 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3139 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3140 Alignment, CostKind, I);
3141 auto *VT = cast<VectorType>(DataTy);
3142 auto LT = getTypeLegalizationCost(DataTy);
3143 if (!LT.first.isValid())
3145
3146 if (!LT.second.isVector() ||
3147 !isElementTypeLegalForScalableVector(VT->getElementType()))
3149
3150 // The code-generator is currently not able to handle scalable vectors
3151 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3152 // it. This change will be removed when code-generation for these types is
3153 // sufficiently reliable.
3154 if (cast<VectorType>(DataTy)->getElementCount() ==
3157
3158 ElementCount LegalVF = LT.second.getVectorElementCount();
3159 InstructionCost MemOpCost =
3160 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3161 {TTI::OK_AnyValue, TTI::OP_None}, I);
3162 // Add on an overhead cost for using gathers/scatters.
3163 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3164 // point we may want a per-CPU overhead.
3165 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3166 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3167}
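
// Worked sketch of the gather/scatter formula above:
//   cost = LT.first * (scalar memory-op cost * per-element overhead)
//                   * max number of elements per legalised vector.
// The numbers are assumptions for illustration (overhead 10 from the
// sve-gather-overhead default, scalar load cost 1, vscale treated as 1), not
// values queried from a real subtarget.
static unsigned sveGatherCostSketch(unsigned NumLegalizationSteps,
                                    unsigned ElementsPerVector) {
  const unsigned ScalarMemOpCost = 1;
  const unsigned GatherOverhead = 10;
  return NumLegalizationSteps * (ScalarMemOpCost * GatherOverhead) *
         ElementsPerVector; // e.g. one nxv4i32 gather: 1 * 10 * 4 = 40
}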
3168
3169 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3170 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3171}
3172
3173 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3174 MaybeAlign Alignment,
3175 unsigned AddressSpace,
3176 TTI::TargetCostKind CostKind,
3177 TTI::OperandValueInfo OpInfo,
3178 const Instruction *I) {
3179 EVT VT = TLI->getValueType(DL, Ty, true);
3180 // Type legalization can't handle structs
3181 if (VT == MVT::Other)
3182 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3183 CostKind);
3184
3185 auto LT = getTypeLegalizationCost(Ty);
3186 if (!LT.first.isValid())
3188
3189 // The code-generator is currently not able to handle scalable vectors
3190 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3191 // it. This change will be removed when code-generation for these types is
3192 // sufficiently reliable.
3193 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3194 if (VTy->getElementCount() == ElementCount::getScalable(1))
3196
3197 // TODO: consider latency as well for TCK_SizeAndLatency.
3199 return LT.first;
3200
3202 return 1;
3203
3204 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3205 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3206 // Unaligned stores are extremely inefficient. We don't split all
3207 // unaligned 128-bit stores because of the negative impact that has been
3208 // shown in practice on inlined block copy code.
3209 // We make such stores expensive so that we will only vectorize if there
3210 // are 6 other instructions getting vectorized.
3211 const int AmortizationCost = 6;
3212
3213 return LT.first * 2 * AmortizationCost;
3214 }
3215
3216 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3217 if (Ty->isPtrOrPtrVectorTy())
3218 return LT.first;
3219
3220 if (useNeonVector(Ty)) {
3221 // Check truncating stores and extending loads.
3222 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3223 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3224 if (VT == MVT::v4i8)
3225 return 2;
3226 // Otherwise we need to scalarize.
3227 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3228 }
3229 EVT EltVT = VT.getVectorElementType();
3230 unsigned EltSize = EltVT.getScalarSizeInBits();
3231 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3232 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3233 *Alignment != Align(1))
3234 return LT.first;
3235 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3236 // widening to v4i8, which produces suboptimal results.
3237 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3238 return LT.first;
3239
3240 // Check non-power-of-2 loads/stores for legal vector element types with
3241 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3242 // operations on smaller power-of-2 ops, including ld1/st1.
3243 LLVMContext &C = Ty->getContext();
3244 InstructionCost Cost = 0;
3245 SmallVector<EVT> TypeWorklist;
3246 TypeWorklist.push_back(VT);
3247 while (!TypeWorklist.empty()) {
3248 EVT CurrVT = TypeWorklist.pop_back_val();
3249 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3250 if (isPowerOf2_32(CurrNumElements)) {
3251 Cost += 1;
3252 continue;
3253 }
3254
3255 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3256 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3257 TypeWorklist.push_back(
3258 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3259 }
3260 return Cost;
3261 }
3262
3263 return LT.first;
3264}
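
// Standalone sketch of the worklist above: a non-power-of-2 element count is
// split into the largest power-of-2 chunk plus a remainder, recursively, and
// each power-of-2 chunk contributes one memory operation. The recursion is
// equivalent to the explicit worklist used in getMemoryOpCost and is only
// meant to illustrate the counting, not the actual type legalisation.
static unsigned nonPow2MemOpCountSketch(unsigned NumElements) {
  if (NumElements == 0)
    return 0;
  if ((NumElements & (NumElements - 1)) == 0)
    return 1; // already a power of 2: one ld1/st1-style operation
  unsigned Prev = 1;
  while (Prev * 2 < NumElements)
    Prev *= 2;
  return nonPow2MemOpCountSketch(Prev) +
         nonPow2MemOpCountSketch(NumElements - Prev);
  // e.g. 7 elements -> 4 + (2 + 1) -> 3 operations
}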
3265
3266 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3267 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3268 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3269 bool UseMaskForCond, bool UseMaskForGaps) {
3270 assert(Factor >= 2 && "Invalid interleave factor");
3271 auto *VecVTy = cast<VectorType>(VecTy);
3272
3273 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3275
3276 // Vectorization for masked interleaved accesses is only enabled for scalable
3277 // VF.
3278 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3280
3281 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3282 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3283 auto *SubVecTy =
3284 VectorType::get(VecVTy->getElementType(),
3285 VecVTy->getElementCount().divideCoefficientBy(Factor));
3286
3287 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3288 // Accesses having vector types that are a multiple of 128 bits can be
3289 // matched to more than one ldN/stN instruction.
3290 bool UseScalable;
3291 if (MinElts % Factor == 0 &&
3292 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3293 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3294 }
3295
3296 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3297 Alignment, AddressSpace, CostKind,
3298 UseMaskForCond, UseMaskForGaps);
3299}
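
// Sketch of the ldN/stN costing above: when the de-interleaved sub-vector
// type is legal, the cost is Factor times the number of ldN/stN instructions
// needed. The "one instruction per 128-bit slice" rule below is an assumption
// for illustration; the real answer comes from isLegalInterleavedAccessType
// and getNumInterleavedAccesses.
static unsigned interleavedAccessCostSketch(unsigned Factor,
                                            unsigned SubVecBits) {
  unsigned NumAccesses = (SubVecBits + 127) / 128;
  if (NumAccesses == 0)
    NumAccesses = 1;
  return Factor * NumAccesses;
  // e.g. de-interleaving <8 x i32> by Factor 2 gives <4 x i32> (128 bits),
  // so the cost is 2 * 1 = 2.
}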
3300
3301 InstructionCost
3302 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3303 InstructionCost Cost = 0;
3304 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3305 for (auto *I : Tys) {
3306 if (!I->isVectorTy())
3307 continue;
3308 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3309 128)
3310 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3311 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3312 }
3313 return Cost;
3314}
3315
3316 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3317 return ST->getMaxInterleaveFactor();
3318}
3319
3320// For Falkor, we want to avoid having too many strided loads in a loop since
3321// that can exhaust the HW prefetcher resources. We adjust the unroller
3322// MaxCount preference below to attempt to ensure unrolling doesn't create too
3323// many strided loads.
3324static void
3325 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3326 TargetTransformInfo::UnrollingPreferences &UP) {
3327 enum { MaxStridedLoads = 7 };
3328 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3329 int StridedLoads = 0;
3330 // FIXME? We could make this more precise by looking at the CFG and
3331 // e.g. not counting loads in each side of an if-then-else diamond.
3332 for (const auto BB : L->blocks()) {
3333 for (auto &I : *BB) {
3334 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3335 if (!LMemI)
3336 continue;
3337
3338 Value *PtrValue = LMemI->getPointerOperand();
3339 if (L->isLoopInvariant(PtrValue))
3340 continue;
3341
3342 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3343 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3344 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3345 continue;
3346
3347 // FIXME? We could take pairing of unrolled load copies into account
3348 // by looking at the AddRec, but we would probably have to limit this
3349 // to loops with no stores or other memory optimization barriers.
3350 ++StridedLoads;
3351 // We've seen enough strided loads that seeing more won't make a
3352 // difference.
3353 if (StridedLoads > MaxStridedLoads / 2)
3354 return StridedLoads;
3355 }
3356 }
3357 return StridedLoads;
3358 };
3359
3360 int StridedLoads = countStridedLoads(L, SE);
3361 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3362 << " strided loads\n");
3363 // Pick the largest power of 2 unroll count that won't result in too many
3364 // strided loads.
3365 if (StridedLoads) {
3366 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3367 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3368 << UP.MaxCount << '\n');
3369 }
3370}
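
// Worked sketch of the Falkor clamp above: the unroll count is the largest
// power of two that keeps the unrolled strided-load count within the budget
// of MaxStridedLoads (7). It assumes a non-zero strided-load count and mirrors
// 1 << Log2_32(MaxStridedLoads / StridedLoads).
static unsigned falkorMaxUnrollSketch(unsigned StridedLoads) {
  const unsigned MaxStridedLoads = 7;
  unsigned Budget = MaxStridedLoads / StridedLoads;
  unsigned MaxCount = 1;
  while (MaxCount * 2 <= Budget)
    MaxCount *= 2;
  return MaxCount; // 1 strided load -> 4x, 2 -> 2x, 3 -> 2x
}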
3371
3372 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3373 TTI::UnrollingPreferences &UP,
3374 OptimizationRemarkEmitter *ORE) {
3375 // Enable partial unrolling and runtime unrolling.
3376 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3377
3378 UP.UpperBound = true;
3379
3380 // An inner loop is more likely to be hot, and its runtime checks can often
3381 // be hoisted out by LICM, so the overhead is lower; try a larger threshold
3382 // to unroll more loops.
3383 if (L->getLoopDepth() > 1)
3384 UP.PartialThreshold *= 2;
3385
3386 // Disable partial & runtime unrolling on -Os.
3388
3392
3393 // Scan the loop: don't unroll loops with calls as this could prevent
3394 // inlining. Don't unroll vector loops either, as they don't benefit much from
3395 // unrolling.
3396 for (auto *BB : L->getBlocks()) {
3397 for (auto &I : *BB) {
3398 // Don't unroll vectorised loops.
3399 if (I.getType()->isVectorTy())
3400 return;
3401
3402 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3403 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3404 if (!isLoweredToCall(F))
3405 continue;
3406 }
3407 return;
3408 }
3409 }
3410 }
3411
3412 // Enable runtime unrolling for in-order models.
3413 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3414 // checking for that case, we can ensure that the default behaviour is
3415 // unchanged.
3416 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3417 !ST->getSchedModel().isOutOfOrder()) {
3418 UP.Runtime = true;
3419 UP.Partial = true;
3420 UP.UnrollRemainder = true;
3422
3423 UP.UnrollAndJam = true;
3425 }
3426}
3427
3428 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3429 TTI::PeelingPreferences &PP) {
3430 BaseT::getPeelingPreferences(L, SE, PP);
3431 }
3432
3433 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
3434 Type *ExpectedType) {
3435 switch (Inst->getIntrinsicID()) {
3436 default:
3437 return nullptr;
3438 case Intrinsic::aarch64_neon_st2:
3439 case Intrinsic::aarch64_neon_st3:
3440 case Intrinsic::aarch64_neon_st4: {
3441 // Create a struct type
3442 StructType *ST = dyn_cast<StructType>(ExpectedType);
3443 if (!ST)
3444 return nullptr;
3445 unsigned NumElts = Inst->arg_size() - 1;
3446 if (ST->getNumElements() != NumElts)
3447 return nullptr;
3448 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3449 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3450 return nullptr;
3451 }
3452 Value *Res = PoisonValue::get(ExpectedType);
3453 IRBuilder<> Builder(Inst);
3454 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3455 Value *L = Inst->getArgOperand(i);
3456 Res = Builder.CreateInsertValue(Res, L, i);
3457 }
3458 return Res;
3459 }
3460 case Intrinsic::aarch64_neon_ld2:
3461 case Intrinsic::aarch64_neon_ld3:
3462 case Intrinsic::aarch64_neon_ld4:
3463 if (Inst->getType() == ExpectedType)
3464 return Inst;
3465 return nullptr;
3466 }
3467}
3468
3469 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3470 MemIntrinsicInfo &Info) {
3471 switch (Inst->getIntrinsicID()) {
3472 default:
3473 break;
3474 case Intrinsic::aarch64_neon_ld2:
3475 case Intrinsic::aarch64_neon_ld3:
3476 case Intrinsic::aarch64_neon_ld4:
3477 Info.ReadMem = true;
3478 Info.WriteMem = false;
3479 Info.PtrVal = Inst->getArgOperand(0);
3480 break;
3481 case Intrinsic::aarch64_neon_st2:
3482 case Intrinsic::aarch64_neon_st3:
3483 case Intrinsic::aarch64_neon_st4:
3484 Info.ReadMem = false;
3485 Info.WriteMem = true;
3486 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3487 break;
3488 }
3489
3490 switch (Inst->getIntrinsicID()) {
3491 default:
3492 return false;
3493 case Intrinsic::aarch64_neon_ld2:
3494 case Intrinsic::aarch64_neon_st2:
3495 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3496 break;
3497 case Intrinsic::aarch64_neon_ld3:
3498 case Intrinsic::aarch64_neon_st3:
3499 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3500 break;
3501 case Intrinsic::aarch64_neon_ld4:
3502 case Intrinsic::aarch64_neon_st4:
3503 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3504 break;
3505 }
3506 return true;
3507}
3508
3509/// See if \p I should be considered for address type promotion. We check if \p
3510 /// I is a sext with the right type and used in memory accesses. If it is used
3511 /// in a "complex" getelementptr, we allow it to be promoted without finding
3512 /// other sext instructions that sign extended the same initial value. A
3513 /// getelementptr is considered "complex" if it has more than 2 operands.
3514 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3515 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3516 bool Considerable = false;
3517 AllowPromotionWithoutCommonHeader = false;
3518 if (!isa<SExtInst>(&I))
3519 return false;
3520 Type *ConsideredSExtType =
3521 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3522 if (I.getType() != ConsideredSExtType)
3523 return false;
3524 // See if the sext is the one with the right type and used in at least one
3525 // GetElementPtrInst.
3526 for (const User *U : I.users()) {
3527 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3528 Considerable = true;
3529 // A getelementptr is considered as "complex" if it has more than 2
3530 // operands. We will promote a SExt used in such complex GEP as we
3531 // expect some computation to be merged if they are done on 64 bits.
3532 if (GEPInst->getNumOperands() > 2) {
3533 AllowPromotionWithoutCommonHeader = true;
3534 break;
3535 }
3536 }
3537 }
3538 return Considerable;
3539}
3540
3541 bool AArch64TTIImpl::isLegalToVectorizeReduction(
3542 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3543 if (!VF.isScalable())
3544 return true;
3545
3546 Type *Ty = RdxDesc.getRecurrenceType();
3548 return false;
3549
3550 switch (RdxDesc.getRecurrenceKind()) {
3551 case RecurKind::Add:
3552 case RecurKind::FAdd:
3553 case RecurKind::And:
3554 case RecurKind::Or:
3555 case RecurKind::Xor:
3556 case RecurKind::SMin:
3557 case RecurKind::SMax:
3558 case RecurKind::UMin:
3559 case RecurKind::UMax:
3560 case RecurKind::FMin:
3561 case RecurKind::FMax:
3562 case RecurKind::FMulAdd:
3563 case RecurKind::IAnyOf:
3564 case RecurKind::FAnyOf:
3565 return true;
3566 default:
3567 return false;
3568 }
3569}
3570
3571 InstructionCost
3572 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
3573 FastMathFlags FMF,
3574 TTI::TargetCostKind CostKind) {
3575 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3576
3577 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3578 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3579
3580 InstructionCost LegalizationCost = 0;
3581 if (LT.first > 1) {
3582 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3583 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3584 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3585 }
3586
3587 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3588}
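
// Sketch of the min/max reduction costing above: each extra legalisation step
// combines two legal vectors with one vector min/max, and the final legal
// vector is reduced with a horizontal reduction modelled as cost 2. The
// per-step cost of 1 is an assumption standing in for getIntrinsicInstrCost.
static unsigned minMaxReductionCostSketch(unsigned NumLegalizationSteps) {
  const unsigned PerStepMinMaxCost = 1;
  unsigned LegalizationCost =
      NumLegalizationSteps > 1
          ? (NumLegalizationSteps - 1) * PerStepMinMaxCost
          : 0;
  return LegalizationCost + 2; // horizontal fminv/fmaxv/sminv/... reduction
}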
3589
3590 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3591 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3592 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3593 InstructionCost LegalizationCost = 0;
3594 if (LT.first > 1) {
3595 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3596 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3597 LegalizationCost *= LT.first - 1;
3598 }
3599
3600 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3601 assert(ISD && "Invalid opcode");
3602 // Add the final reduction cost for the legal horizontal reduction
3603 switch (ISD) {
3604 case ISD::ADD:
3605 case ISD::AND:
3606 case ISD::OR:
3607 case ISD::XOR:
3608 case ISD::FADD:
3609 return LegalizationCost + 2;
3610 default:
3612 }
3613}
3614
3615 InstructionCost
3616 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3617 std::optional<FastMathFlags> FMF,
3618 TTI::TargetCostKind CostKind) {
3619 if (TTI::requiresOrderedReduction(FMF)) {
3620 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3621 InstructionCost BaseCost =
3622 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3623 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3624 // end up vectorizing for more computationally intensive loops.
3625 return BaseCost + FixedVTy->getNumElements();
3626 }
3627
3628 if (Opcode != Instruction::FAdd)
3630
3631 auto *VTy = cast<ScalableVectorType>(ValTy);
3633 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3634 Cost *= getMaxNumElements(VTy->getElementCount());
3635 return Cost;
3636 }
3637
3638 if (isa<ScalableVectorType>(ValTy))
3639 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3640
3641 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3642 MVT MTy = LT.second;
3643 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3644 assert(ISD && "Invalid opcode");
3645
3646 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3647 // instructions as twice a normal vector add, plus 1 for each legalization
3648 // step (LT.first). This is the only arithmetic vector reduction operation for
3649 // which we have an instruction.
3650 // OR, XOR and AND costs should match the codegen from:
3651 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3652 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3653 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3654 static const CostTblEntry CostTblNoPairwise[]{
3655 {ISD::ADD, MVT::v8i8, 2},
3656 {ISD::ADD, MVT::v16i8, 2},
3657 {ISD::ADD, MVT::v4i16, 2},
3658 {ISD::ADD, MVT::v8i16, 2},
3659 {ISD::ADD, MVT::v4i32, 2},
3660 {ISD::ADD, MVT::v2i64, 2},
3661 {ISD::OR, MVT::v8i8, 15},
3662 {ISD::OR, MVT::v16i8, 17},
3663 {ISD::OR, MVT::v4i16, 7},
3664 {ISD::OR, MVT::v8i16, 9},
3665 {ISD::OR, MVT::v2i32, 3},
3666 {ISD::OR, MVT::v4i32, 5},
3667 {ISD::OR, MVT::v2i64, 3},
3668 {ISD::XOR, MVT::v8i8, 15},
3669 {ISD::XOR, MVT::v16i8, 17},
3670 {ISD::XOR, MVT::v4i16, 7},
3671 {ISD::XOR, MVT::v8i16, 9},
3672 {ISD::XOR, MVT::v2i32, 3},
3673 {ISD::XOR, MVT::v4i32, 5},
3674 {ISD::XOR, MVT::v2i64, 3},
3675 {ISD::AND, MVT::v8i8, 15},
3676 {ISD::AND, MVT::v16i8, 17},
3677 {ISD::AND, MVT::v4i16, 7},
3678 {ISD::AND, MVT::v8i16, 9},
3679 {ISD::AND, MVT::v2i32, 3},
3680 {ISD::AND, MVT::v4i32, 5},
3681 {ISD::AND, MVT::v2i64, 3},
3682 };
3683 switch (ISD) {
3684 default:
3685 break;
3686 case ISD::ADD:
3687 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3688 return (LT.first - 1) + Entry->Cost;
3689 break;
3690 case ISD::XOR:
3691 case ISD::AND:
3692 case ISD::OR:
3693 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3694 if (!Entry)
3695 break;
3696 auto *ValVTy = cast<FixedVectorType>(ValTy);
3697 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3698 isPowerOf2_32(ValVTy->getNumElements())) {
3699 InstructionCost ExtraCost = 0;
3700 if (LT.first != 1) {
3701 // Type needs to be split, so there is an extra cost of LT.first - 1
3702 // arithmetic ops.
3703 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3704 MTy.getVectorNumElements());
3705 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3706 ExtraCost *= LT.first - 1;
3707 }
3708 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3709 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3710 return Cost + ExtraCost;
3711 }
3712 break;
3713 }
3714 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3715}
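
// Worked example of the addv-style costing above, as a standalone sketch: an
// add reduction costs (LT.first - 1) legalisation adds plus the table entry,
// which is 2 for every legal integer type. Reducing a <32 x i8> that splits
// into two legal <16 x i8> halves therefore costs (2 - 1) + 2 = 3. The addv
// constant comes from CostTblNoPairwise; the rest is an assumption for
// illustration.
static unsigned addReductionCostSketch(unsigned NumLegalizationSteps) {
  const unsigned AddvCost = 2; // ISD::ADD entries in the table above
  return (NumLegalizationSteps - 1) + AddvCost;
}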
3716
3717 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3718 static const CostTblEntry ShuffleTbl[] = {
3719 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3720 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3721 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3722 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3723 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3724 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3725 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3726 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3727 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3728 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3729 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3730 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3731 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3732 };
3733
3734 // The code-generator is currently not able to handle scalable vectors
3735 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3736 // it. This change will be removed when code-generation for these types is
3737 // sufficiently reliable.
3740
3741 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3742 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3744 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3745 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3746 : LT.second;
3747 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3748 InstructionCost LegalizationCost = 0;
3749 if (Index < 0) {
3750 LegalizationCost =
3751 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3753 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3755 }
3756
3757 // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
3758 // The cost is computed on the promoted type.
3759 if (LT.second.getScalarType() == MVT::i1) {
3760 LegalizationCost +=
3761 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3763 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3765 }
3766 const auto *Entry =
3767 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3768 assert(Entry && "Illegal Type for Splice");
3769 LegalizationCost += Entry->Cost;
3770 return LegalizationCost * LT.first;
3771}
3772
3773 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
3774 VectorType *Tp,
3775 ArrayRef<int> Mask,
3776 TTI::TargetCostKind CostKind,
3777 int Index, VectorType *SubTp,
3778 ArrayRef<const Value *> Args) {
3779 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3780 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3781 // into smaller vectors and sum the cost of each shuffle.
3782 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3783 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3784 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
3785 unsigned TpNumElts = Mask.size();
3786 unsigned LTNumElts = LT.second.getVectorNumElements();
3787 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3788 VectorType *NTp =
3789 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3791 for (unsigned N = 0; N < NumVecs; N++) {
3792 SmallVector<int> NMask;
3793 // Split the existing mask into chunks of size LTNumElts. Track the source
3794 // sub-vectors to ensure the result has at most 2 inputs.
3795 unsigned Source1, Source2;
3796 unsigned NumSources = 0;
3797 for (unsigned E = 0; E < LTNumElts; E++) {
3798 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3800 if (MaskElt < 0) {
3802 continue;
3803 }
3804
3805 // Calculate which source from the input this comes from and whether it
3806 // is new to us.
3807 unsigned Source = MaskElt / LTNumElts;
3808 if (NumSources == 0) {
3809 Source1 = Source;
3810 NumSources = 1;
3811 } else if (NumSources == 1 && Source != Source1) {
3812 Source2 = Source;
3813 NumSources = 2;
3814 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3815 NumSources++;
3816 }
3817
3818 // Add to the new mask. For the NumSources>2 case these are not correct,
3819 // but are only used for the modular lane number.
3820 if (Source == Source1)
3821 NMask.push_back(MaskElt % LTNumElts);
3822 else if (Source == Source2)
3823 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3824 else
3825 NMask.push_back(MaskElt % LTNumElts);
3826 }
3827 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3828 // getShuffleCost. If not then cost it using the worst case.
3829 if (NumSources <= 2)
3830 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3832 NTp, NMask, CostKind, 0, nullptr, Args);
3833 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3834 return ME.value() % LTNumElts == ME.index();
3835 }))
3836 Cost += LTNumElts - 1;
3837 else
3838 Cost += LTNumElts;
3839 }
3840 return Cost;
3841 }
3842
3843 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
3844 // Treat extractsubvector as single op permutation.
3845 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
3846 if (IsExtractSubvector && LT.second.isFixedLengthVector())
3848
3849 // Check for broadcast loads, which are supported by the LD1R instruction.
3850 // In terms of code-size, the shuffle vector is free when a load + dup get
3851 // folded into a LD1R. That's what we check and return here. For performance
3852 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3853 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3854 // that we model the load + dup sequence slightly higher because LD1R is a
3855 // high latency instruction.
3856 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3857 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3858 if (IsLoad && LT.second.isVector() &&
3860 LT.second.getVectorElementCount()))
3861 return 0;
3862 }
3863
3864 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3865 // from the perfect shuffle tables.
3866 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3867 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3868 all_of(Mask, [](int E) { return E < 8; }))
3869 return getPerfectShuffleCost(Mask);
3870
3871 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3872 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3873 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3874 static const CostTblEntry ShuffleTbl[] = {
3875 // Broadcast shuffle kinds can be performed with 'dup'.
3876 {TTI::SK_Broadcast, MVT::v8i8, 1},
3877 {TTI::SK_Broadcast, MVT::v16i8, 1},
3878 {TTI::SK_Broadcast, MVT::v4i16, 1},
3879 {TTI::SK_Broadcast, MVT::v8i16, 1},
3880 {TTI::SK_Broadcast, MVT::v2i32, 1},
3881 {TTI::SK_Broadcast, MVT::v4i32, 1},
3882 {TTI::SK_Broadcast, MVT::v2i64, 1},
3883 {TTI::SK_Broadcast, MVT::v4f16, 1},
3884 {TTI::SK_Broadcast, MVT::v8f16, 1},
3885 {TTI::SK_Broadcast, MVT::v2f32, 1},
3886 {TTI::SK_Broadcast, MVT::v4f32, 1},
3887 {TTI::SK_Broadcast, MVT::v2f64, 1},
3888 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3889 // 'zip1/zip2' instructions.
3890 {TTI::SK_Transpose, MVT::v8i8, 1},
3891 {TTI::SK_Transpose, MVT::v16i8, 1},
3892 {TTI::SK_Transpose, MVT::v4i16, 1},
3893 {TTI::SK_Transpose, MVT::v8i16, 1},
3894 {TTI::SK_Transpose, MVT::v2i32, 1},
3895 {TTI::SK_Transpose, MVT::v4i32, 1},
3896 {TTI::SK_Transpose, MVT::v2i64, 1},
3897 {TTI::SK_Transpose, MVT::v4f16, 1},
3898 {TTI::SK_Transpose, MVT::v8f16, 1},
3899 {TTI::SK_Transpose, MVT::v2f32, 1},
3900 {TTI::SK_Transpose, MVT::v4f32, 1},
3901 {TTI::SK_Transpose, MVT::v2f64, 1},
3902 // Select shuffle kinds.
3903 // TODO: handle vXi8/vXi16.
3904 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3905 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3906 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3907 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3908 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3909 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3910 // PermuteSingleSrc shuffle kinds.
3911 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3912 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3913 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3914 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3915 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3916 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3917 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3918 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3919 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3920 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3921 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3922 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3923 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3924 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3925 // Reverse can be lowered with `rev`.
3926 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3927 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3928 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3929 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3930 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3931 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3932 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3933 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3934 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3935 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3936 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3937 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3938 // Splice can all be lowered as `ext`.
3939 {TTI::SK_Splice, MVT::v2i32, 1},
3940 {TTI::SK_Splice, MVT::v4i32, 1},
3941 {TTI::SK_Splice, MVT::v2i64, 1},
3942 {TTI::SK_Splice, MVT::v2f32, 1},
3943 {TTI::SK_Splice, MVT::v4f32, 1},
3944 {TTI::SK_Splice, MVT::v2f64, 1},
3945 {TTI::SK_Splice, MVT::v8f16, 1},
3946 {TTI::SK_Splice, MVT::v8bf16, 1},
3947 {TTI::SK_Splice, MVT::v8i16, 1},
3948 {TTI::SK_Splice, MVT::v16i8, 1},
3949 {TTI::SK_Splice, MVT::v4bf16, 1},
3950 {TTI::SK_Splice, MVT::v4f16, 1},
3951 {TTI::SK_Splice, MVT::v4i16, 1},
3952 {TTI::SK_Splice, MVT::v8i8, 1},
3953 // Broadcast shuffle kinds for scalable vectors
3954 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
3955 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
3956 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
3957 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
3958 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
3959 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
3960 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
3961 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
3962 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
3963 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
3964 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
3965 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
3966 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
3967 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
3968 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
3969 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
3970 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
3971 // Handle the cases for vector.reverse with scalable vectors
3972 {TTI::SK_Reverse, MVT::nxv16i8, 1},
3973 {TTI::SK_Reverse, MVT::nxv8i16, 1},
3974 {TTI::SK_Reverse, MVT::nxv4i32, 1},
3975 {TTI::SK_Reverse, MVT::nxv2i64, 1},
3976 {TTI::SK_Reverse, MVT::nxv2f16, 1},
3977 {TTI::SK_Reverse, MVT::nxv4f16, 1},
3978 {TTI::SK_Reverse, MVT::nxv8f16, 1},
3979 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
3980 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
3981 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
3982 {TTI::SK_Reverse, MVT::nxv2f32, 1},
3983 {TTI::SK_Reverse, MVT::nxv4f32, 1},
3984 {TTI::SK_Reverse, MVT::nxv2f64, 1},
3985 {TTI::SK_Reverse, MVT::nxv16i1, 1},
3986 {TTI::SK_Reverse, MVT::nxv8i1, 1},
3987 {TTI::SK_Reverse, MVT::nxv4i1, 1},
3988 {TTI::SK_Reverse, MVT::nxv2i1, 1},
3989 };
3990 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3991 return LT.first * Entry->Cost;
3992 }
3993
3994 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3995 return getSpliceCost(Tp, Index);
3996
3997 // Inserting a subvector can often be done with either a D, S or H register
3998 // move, so long as the inserted vector is "aligned".
3999 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4000 LT.second.getSizeInBits() <= 128 && SubTp) {
4001 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4002 if (SubLT.second.isVector()) {
4003 int NumElts = LT.second.getVectorNumElements();
4004 int NumSubElts = SubLT.second.getVectorNumElements();
4005 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4006 return SubLT.first;
4007 }
4008 }
4009
4010 // Restore optimal kind.
4011 if (IsExtractSubvector)
4013 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
4014}
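
// Standalone sketch of the mask-splitting logic at the top of getShuffleCost:
// a long shuffle mask is cut into chunks of the legal vector width and each
// chunk is classified by how many source sub-vectors it reads. Chunks with at
// most two sources can be re-costed as ordinary one- or two-input shuffles;
// chunks with more fall back to roughly one insert per lane. Only the
// source-counting is reproduced here, with plain arrays instead of the LLVM
// data structures.
static unsigned countChunkSourcesSketch(const int *Mask, unsigned MaskLen,
                                        unsigned ChunkBegin, unsigned ChunkLen,
                                        unsigned LegalNumElts) {
  unsigned Sources[4]; // enough to decide "more than two sources"
  unsigned NumSources = 0;
  for (unsigned E = 0; E < ChunkLen; ++E) {
    unsigned Idx = ChunkBegin + E;
    int MaskElt = Idx < MaskLen ? Mask[Idx] : -1;
    if (MaskElt < 0)
      continue; // undef lane contributes no source
    unsigned Source = static_cast<unsigned>(MaskElt) / LegalNumElts;
    bool Seen = false;
    for (unsigned S = 0; S != NumSources; ++S)
      Seen |= (Sources[S] == Source);
    if (!Seen && NumSources < 4)
      Sources[NumSources++] = Source;
  }
  return NumSources;
  // e.g. the mask {0,4,1,5,2,6,3,7} split into two 4-lane chunks: each chunk
  // reads from sources 0 and 1, so both can be costed as two-input shuffles.
}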
4015
4016 static bool containsDecreasingPointers(Loop *TheLoop,
4017 PredicatedScalarEvolution *PSE) {
4018 const auto &Strides = DenseMap<Value *, const SCEV *>();
4019 for (BasicBlock *BB : TheLoop->blocks()) {
4020 // Scan the instructions in the block and look for addresses that are
4021 // consecutive and decreasing.
4022 for (Instruction &I : *BB) {
4023 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4025 Type *AccessTy = getLoadStoreType(&I);
4026 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4027 /*ShouldCheckWrap=*/false)
4028 .value_or(0) < 0)
4029 return true;
4030 }
4031 }
4032 }
4033 return false;
4034}
4035
4036 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
4037 if (!ST->hasSVE())
4038 return false;
4039
4040 // We don't currently support vectorisation with interleaving for SVE - with
4041 // such loops we're better off not using tail-folding. This gives us a chance
4042 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4043 if (TFI->IAI->hasGroups())
4044 return false;
4045
4047 if (TFI->LVL->getReductionVars().size())
4048 Required |= TailFoldingOpts::Reductions;
4049 if (TFI->LVL->getFixedOrderRecurrences().size())
4050 Required |= TailFoldingOpts::Recurrences;
4051
4052 // We call this to discover whether any load/store pointers in the loop have
4053 // negative strides. This will require extra work to reverse the loop
4054 // predicate, which may be expensive.
4057 Required |= TailFoldingOpts::Reverse;
4058 if (Required == TailFoldingOpts::Disabled)
4059 Required |= TailFoldingOpts::Simple;
4060
4062 Required))
4063 return false;
4064
4065 // Don't tail-fold for tight loops where we would be better off interleaving
4066 // with an unpredicated loop.
4067 unsigned NumInsns = 0;
4068 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4069 NumInsns += BB->sizeWithoutDebug();
4070 }
4071
4072 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4073 return NumInsns >= SVETailFoldInsnThreshold;
4074}
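
// Minimal sketch of the final size check above: a loop only prefers
// tail-folding once its body reaches sve-tail-folding-insn-threshold
// instructions (15 by default), on the assumption that roughly four of them
// are pure loop overhead (IV PHI, IV add, compare and branch).
static bool preferTailFoldingBySizeSketch(unsigned LoopBodyInsns) {
  const unsigned Threshold = 15; // mirrors SVETailFoldInsnThreshold's default
  return LoopBodyInsns >= Threshold;
}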
4075
4076 InstructionCost
4077 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
4078 int64_t BaseOffset, bool HasBaseReg,
4079 int64_t Scale, unsigned AddrSpace) const {
4080 // Scaling factors are not free at all.
4081 // Operands | Rt Latency
4082 // -------------------------------------------
4083 // Rt, [Xn, Xm] | 4
4084 // -------------------------------------------
4085 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4086 // Rt, [Xn, Wm, <extend> #imm] |
4088 AM.BaseGV = BaseGV;
4089 AM.BaseOffs = BaseOffset;
4090 AM.HasBaseReg = HasBaseReg;
4091 AM.Scale = Scale;
4092 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4093 // Scale represents reg2 * scale, so account for 1 when the scale is
4094 // neither 0 nor 1 (a scale of 0 or 1 is free).
4095 return AM.Scale != 0 && AM.Scale != 1;
4096 return -1;
4097}
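
// Sketch of the scaling-factor rule above: when the addressing mode is legal,
// a scaled register operand (scale other than 0 or 1) costs one extra unit
// and the unscaled forms are free; an illegal mode reports a negative cost.
// The legality flag is an assumption standing in for isLegalAddressingMode.
static int scalingFactorCostSketch(bool IsLegalAddressingMode, long Scale) {
  if (IsLegalAddressingMode)
    return (Scale != 0 && Scale != 1) ? 1 : 0;
  return -1;
}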
4098
4099 bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
4100 // For the binary operators (e.g. or) we need to be more careful than
4101 // selects, here we only transform them if they are already at a natural
4102 // break point in the code - the end of a block with an unconditional
4103 // terminator.
4104 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4105 isa<BranchInst>(I->getNextNode()) &&
4106 cast<BranchInst>(I->getNextNode())->isUnconditional())
4107 return true;
4108 return BaseT::shouldTreatInstructionLikeSelect(I);
4109 }
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
amdgpu AMDGPU Register Bank Select
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
IntegerType * Int32Ty
#define P(N)
const char LLVMTargetMachineRef TM
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:76
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1614
unsigned countLeadingOnes() const
Definition: APInt.h:1568
void negate()
Negate this APInt in place.
Definition: APInt.h:1415
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1696
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1507
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:576
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:885
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:963
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:756
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:335
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:648
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:849
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:248
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
Definition: InstrTypes.h:1259
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1481
bool isStrictFP() const
Determine if the call requires strict floating point semantics.
Definition: InstrTypes.h:1951
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1426
unsigned arg_size() const
Definition: InstrTypes.h:1424
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1520
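A short sketch of how the CallBase accessors above are typically combined when inspecting a call site; the helper name and predicate are hypothetical:
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
// Hypothetical predicate: true if the callee is known directly (not an
// indirect call) and every argument is a Constant.
static bool isDirectCallWithConstantArgs(const CallBase &CB) {
  if (!CB.getCalledFunction())
    return false;
  for (unsigned I = 0, E = CB.arg_size(); I != E; ++I)
    if (!isa<Constant>(CB.getArgOperand(I)))
      return false;
  return true;
}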
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:780
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:783
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:786
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:784
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:785
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:787
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:796
bool isIntPredicate() const
Definition: InstrTypes.h:888
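The four digits in the FCMP_* comments above are condition bits (unordered, less, greater, equal, reading left to right); FCMP_OGT, for instance, sets only the greater-than bit. Predicate kind can be queried directly, as in this small sketch using the static form of isIntPredicate:
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
// Sketch: FP predicates and integer predicates occupy disjoint enum ranges.
static bool isFPComparison(CmpInst::Predicate P) {
  return !CmpInst::isIntPredicate(P); // false for ICMP_EQ, true for FCMP_OEQ
}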
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1579
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:137
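The splatting behaviour of ConstantInt::get(Type *, uint64_t, bool) noted above means the same call works for scalar and vector types. A minimal sketch; the context, types, and values are illustrative:
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;
void constantIntExamples(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Scalar = ConstantInt::get(I32, 42);   // i32 42
  auto *V4I32 = FixedVectorType::get(I32, 4);
  Constant *Splat = ConstantInt::get(V4I32, 42);  // <4 x i32> splat of 42
  Constant *Zero = Constant::getNullValue(V4I32); // zeroinitializer
  (void)Scalar; (void)Splat; (void)Zero;
}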
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:356
This class represents an Operation in the Expression.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:110
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:299
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:296
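ElementCount::getFixed and ElementCount::getScalable describe the element counts of fixed and scalable vectors respectively; getFixed(4) corresponds to a <4 x i32>-style type, while getScalable(4) corresponds to <vscale x 4 x i32>. A brief sketch:
#include "llvm/Support/TypeSize.h"
using namespace llvm;
void elementCountExamples() {
  ElementCount Fixed = ElementCount::getFixed(4);       // exactly 4 elements
  ElementCount Scalable = ElementCount::getScalable(4); // 4 x vscale elements
  bool IsScalable = Scalable.isScalable();              // true
  unsigned MinElts = Scalable.getKnownMinValue();       // 4
  (void)Fixed; (void)IsScalable; (void)MinElts;
}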
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:949
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:88
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2455
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2506
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1039
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2443
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:533
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:553
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1212
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:930
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1108
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:520
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:538
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:305
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:525
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:485
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2188
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2110
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition: IRBuilder.h:1789
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2477
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1802
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:548
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1660
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2179
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1865
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
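Many of the IRBuilder helpers listed above appear together when emitting replacement IR. A self-contained sketch of a few of them; the module, function, and value names are made up for illustration:
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;
void irBuilderExamples(LLVMContext &Ctx) {
  Module M("demo", Ctx);
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "demo_fn", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);

  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(BB);                     // append to the end of BB

  FastMathFlags FMF;
  FMF.setAllowContract(true);
  Builder.setFastMathFlags(FMF);                  // applied to later FP ops

  Value *Splat = Builder.CreateVectorSplat(4, Builder.getInt64(1)); // <4 x i64>
  Value *Elt = Builder.CreateExtractElement(Splat, Builder.getInt64(0));
  Value *Sum = Builder.CreateBinOp(Instruction::Add, Elt, Builder.getInt64(41));
  Value *Wide = Builder.CreateIntCast(Sum, Builder.getIntNTy(128), /*isSigned=*/true);
  (void)Wide;
  Builder.CreateRetVoid();
}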
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:47
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:385
BuilderTy & Builder
Definition: InstCombiner.h:60
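The InstCombiner members above (Builder, replaceInstUsesWith, eraseInstFromFunction) are the usual plumbing for the intrinsic folds earlier in this file. A hedged sketch of the common shape of such a fold; the fold itself and the helper name are hypothetical:
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;
// Hypothetical fold: if an intrinsic call simply forwards its first operand,
// replace all of its uses with that operand and let the combiner clean up.
static std::optional<Instruction *> foldForwardingIntrinsic(InstCombiner &IC,
                                                            IntrinsicInst &II) {
  if (II.arg_size() != 1)
    return std::nullopt;                 // not the pattern sketched here
  Value *Op = II.getArgOperand(0);
  if (Op->getType() != II.getType())
    return std::nullopt;
  return IC.replaceInstUsesWith(II, Op); // combiner-aware RAUW
}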
static InstructionCost getInvalid(CostType Val=0)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a parent.
Definition: Instruction.cpp:71
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports these flags.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:250
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:645
const SmallVectorImpl< Type * > & getArgTypes() const