1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
45static cl::opt<unsigned>
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
47 cl::Hidden);
48
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
53
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
57
58static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
59 cl::init(true), cl::Hidden);
60
61static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
62 cl::init(true), cl::Hidden);
63
64namespace {
65class TailFoldingOption {
66 // These bitfields will only ever be set to something non-zero in operator=,
67 // when setting the -sve-tail-folding option. This option should always be of
68 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
69 // InitialBits is one of (disabled|all|simple). EnableBits represents
70 // additional flags we're enabling, and DisableBits for those flags we're
71 // disabling. The default flag is tracked in the variable NeedsDefault, since
72 // at the time of setting the option we may not know what the default value
73 // for the CPU is.
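 // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to
 // TailFoldingOpts::All and records Reverse in DisableBits, while
 // "-sve-tail-folding=default+reductions" turns NeedsDefault back on and
 // records Reductions in EnableBits.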
74 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
75 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
76 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
77
78 // This value needs to be initialised to true in case the user does not
79 // explicitly set the -sve-tail-folding option.
80 bool NeedsDefault = true;
81
82 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
83
84 void setNeedsDefault(bool V) { NeedsDefault = V; }
85
86 void setEnableBit(TailFoldingOpts Bit) {
87 EnableBits |= Bit;
88 DisableBits &= ~Bit;
89 }
90
91 void setDisableBit(TailFoldingOpts Bit) {
92 EnableBits &= ~Bit;
93 DisableBits |= Bit;
94 }
95
96 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
97 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
98
99 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
100 "Initial bits should only include one of "
101 "(disabled|all|simple|default)");
102 Bits = NeedsDefault ? DefaultBits : InitialBits;
103 Bits |= EnableBits;
104 Bits &= ~DisableBits;
105
106 return Bits;
107 }
108
109 void reportError(std::string Opt) {
110 errs() << "invalid argument '" << Opt
111 << "' to -sve-tail-folding=; the option should be of the form\n"
112 " (disabled|all|default|simple)[+(reductions|recurrences"
113 "|reverse|noreductions|norecurrences|noreverse)]\n";
114 report_fatal_error("Unrecognised tail-folding option");
115 }
116
117public:
118
119 void operator=(const std::string &Val) {
120 // If the user explicitly sets -sve-tail-folding= then treat as an error.
121 if (Val.empty()) {
122 reportError("");
123 return;
124 }
125
126 // Since the user is explicitly setting the option we don't automatically
127 // need the default unless they require it.
128 setNeedsDefault(false);
129
130 SmallVector<StringRef, 4> TailFoldTypes;
131 StringRef(Val).split(TailFoldTypes, '+', -1, false);
132
133 unsigned StartIdx = 1;
134 if (TailFoldTypes[0] == "disabled")
135 setInitialBits(TailFoldingOpts::Disabled);
136 else if (TailFoldTypes[0] == "all")
137 setInitialBits(TailFoldingOpts::All);
138 else if (TailFoldTypes[0] == "default")
139 setNeedsDefault(true);
140 else if (TailFoldTypes[0] == "simple")
141 setInitialBits(TailFoldingOpts::Simple);
142 else {
143 StartIdx = 0;
144 setInitialBits(TailFoldingOpts::Disabled);
145 }
146
147 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
148 if (TailFoldTypes[I] == "reductions")
149 setEnableBit(TailFoldingOpts::Reductions);
150 else if (TailFoldTypes[I] == "recurrences")
151 setEnableBit(TailFoldingOpts::Recurrences);
152 else if (TailFoldTypes[I] == "reverse")
153 setEnableBit(TailFoldingOpts::Reverse);
154 else if (TailFoldTypes[I] == "noreductions")
155 setDisableBit(TailFoldingOpts::Reductions);
156 else if (TailFoldTypes[I] == "norecurrences")
157 setDisableBit(TailFoldingOpts::Recurrences);
158 else if (TailFoldTypes[I] == "noreverse")
159 setDisableBit(TailFoldingOpts::Reverse);
160 else
161 reportError(Val);
162 }
163 }
164
165 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
166 return (getBits(DefaultBits) & Required) == Required;
167 }
168};
169} // namespace
170
171TailFoldingOption TailFoldingOptionLoc;
172
174 "sve-tail-folding",
175 cl::desc(
176 "Control the use of vectorisation using tail-folding for SVE where the"
177 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
178 "\ndisabled (Initial) No loop types will vectorize using "
179 "tail-folding"
180 "\ndefault (Initial) Uses the default tail-folding settings for "
181 "the target CPU"
182 "\nall (Initial) All legal loop types will vectorize using "
183 "tail-folding"
184 "\nsimple (Initial) Use tail-folding for simple loops (not "
185 "reductions or recurrences)"
186 "\nreductions Use tail-folding for loops containing reductions"
187 "\nnoreductions Inverse of above"
188 "\nrecurrences Use tail-folding for loops containing fixed order "
189 "recurrences"
190 "\nnorecurrences Inverse of above"
191 "\nreverse Use tail-folding for loops requiring reversed "
192 "predicates"
193 "\nnoreverse Inverse of above"),
194 cl::location(TailFoldingOptionLoc));
195
196// Experimental option that will only be fully functional when the
197// code-generator is changed to use SVE instead of NEON for all fixed-width
198// operations.
200 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
201
202// Experimental option that will only be fully functional when the cost-model
203// and code-generator have been changed to avoid using scalable vector
204// instructions that are not legal in streaming SVE mode.
206 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
207
208static bool isSMEABIRoutineCall(const CallInst &CI) {
209 const auto *F = CI.getCalledFunction();
210 return F && StringSwitch<bool>(F->getName())
211 .Case("__arm_sme_state", true)
212 .Case("__arm_tpidr2_save", true)
213 .Case("__arm_tpidr2_restore", true)
214 .Case("__arm_za_disable", true)
215 .Default(false);
216}
217
218/// Returns true if the function has explicit operations that can only be
219/// lowered using incompatible instructions for the selected mode. This also
220/// returns true if the function F may use or modify ZA state.
221static bool hasPossibleIncompatibleOps(const Function *F) {
222 for (const BasicBlock &BB : *F) {
223 for (const Instruction &I : BB) {
224 // Be conservative for now and assume that any call to inline asm or to
225 // intrinsics could result in non-streaming ops (e.g. calls to
226 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
227 // all native LLVM instructions can be lowered to compatible instructions.
228 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
229 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
230 isSMEABIRoutineCall(cast<CallInst>(I))))
231 return true;
232 }
233 }
234 return false;
235}
236
237bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
238 const Function *Callee) const {
239 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
240
241 // When inlining, we should consider the body of the function, not the
242 // interface.
243 if (CalleeAttrs.hasStreamingBody()) {
244 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
245 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
246 }
247
248 if (CalleeAttrs.isNewZA())
249 return false;
250
251 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
252 CallerAttrs.requiresSMChange(CalleeAttrs)) {
253 if (hasPossibleIncompatibleOps(Callee))
254 return false;
255 }
256
257 const TargetMachine &TM = getTLI()->getTargetMachine();
258
259 const FeatureBitset &CallerBits =
260 TM.getSubtargetImpl(*Caller)->getFeatureBits();
261 const FeatureBitset &CalleeBits =
262 TM.getSubtargetImpl(*Callee)->getFeatureBits();
263
264 // Inline a callee if its target-features are a subset of the caller's
265 // target-features.
266 return (CallerBits & CalleeBits) == CalleeBits;
267}
268
269bool AArch64TTIImpl::areTypesABICompatible(
270 const Function *Caller, const Function *Callee,
271 const ArrayRef<Type *> &Types) const {
272 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
273 return false;
274
275 // We need to ensure that argument promotion does not attempt to promote
276 // pointers to fixed-length vector types larger than 128 bits like
277 // <8 x float> (and pointers to aggregate types which have such fixed-length
278 // vector type members) into the values of the pointees. Such vector types
279 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
280 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
281 // types can be safely treated as 128-bit NEON types and they cannot be
282 // distinguished in IR.
283 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
284 auto FVTy = dyn_cast<FixedVectorType>(Ty);
285 return FVTy &&
286 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
287 }))
288 return false;
289
290 return true;
291}
292
293unsigned
294AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
295 unsigned DefaultCallPenalty) const {
296 // This function calculates a penalty for executing Call in F.
297 //
298 // There are two ways this function can be called:
299 // (1) F:
300 // call from F -> G (the call here is Call)
301 //
302 // For (1), Call.getCaller() == F, so it will always return a high cost if
303 // a streaming-mode change is required (thus promoting the need to inline the
304 // function)
305 //
306 // (2) F:
307 // call from F -> G (the call here is not Call)
308 // G:
309 // call from G -> H (the call here is Call)
310 //
311 // For (2), if after inlining the body of G into F the call to H requires a
312 // streaming-mode change, and the call to G from F would also require a
313 // streaming-mode change, then there is benefit to do the streaming-mode
314 // change only once and avoid inlining of G into F.
315 SMEAttrs FAttrs(*F);
316 SMEAttrs CalleeAttrs(Call);
317 if (FAttrs.requiresSMChange(CalleeAttrs)) {
318 if (F == Call.getCaller()) // (1)
319 return CallPenaltyChangeSM * DefaultCallPenalty;
320 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
321 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
322 }
323
324 return DefaultCallPenalty;
325}
326
331 ST->isNeonAvailable());
332}
333
334/// Calculate the cost of materializing a 64-bit value. This helper
335/// method might only calculate a fraction of a larger immediate. Therefore it
336/// is valid to return a cost of ZERO.
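/// For example, a 64-bit constant with four distinct non-zero 16-bit chunks
/// (such as 0x1234'5678'9ABC'DEF0) typically expands to one MOVZ plus three
/// MOVKs (cost 4), whereas 0xFFFF is a valid logical immediate and costs 0.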
337static InstructionCost getIntImmCost(int64_t Val) {
338 // Check if the immediate can be encoded within an instruction.
339 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
340 return 0;
341
342 if (Val < 0)
343 Val = ~Val;
344
345 // Calculate how many moves we will need to materialize this constant.
346 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
347 AArch64_IMM::expandMOVImm(Val, 64, Insn);
348 return Insn.size();
349}
350
351/// Calculate the cost of materializing the given constant.
352InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
353 TTI::TargetCostKind CostKind) {
354 assert(Ty->isIntegerTy());
355
356 unsigned BitSize = Ty->getPrimitiveSizeInBits();
357 if (BitSize == 0)
358 return ~0U;
359
360 // Sign-extend all constants to a multiple of 64-bit.
361 APInt ImmVal = Imm;
362 if (BitSize & 0x3f)
363 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
364
365 // Split the constant into 64-bit chunks and calculate the cost for each
366 // chunk.
367 InstructionCost Cost = 0;
368 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
369 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
370 int64_t Val = Tmp.getSExtValue();
371 Cost += getIntImmCost(Val);
372 }
374 // We need at least one instruction to materialize the constant.
374 return std::max<InstructionCost>(1, Cost);
375}
376
377InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
378 const APInt &Imm, Type *Ty,
379 TTI::TargetCostKind CostKind,
380 Instruction *Inst) {
381 assert(Ty->isIntegerTy());
382
383 unsigned BitSize = Ty->getPrimitiveSizeInBits();
384 // There is no cost model for constants with a bit size of 0. Return TCC_Free
385 // here, so that constant hoisting will ignore this constant.
386 if (BitSize == 0)
387 return TTI::TCC_Free;
388
389 unsigned ImmIdx = ~0U;
390 switch (Opcode) {
391 default:
392 return TTI::TCC_Free;
393 case Instruction::GetElementPtr:
394 // Always hoist the base address of a GetElementPtr.
395 if (Idx == 0)
396 return 2 * TTI::TCC_Basic;
397 return TTI::TCC_Free;
398 case Instruction::Store:
399 ImmIdx = 0;
400 break;
401 case Instruction::Add:
402 case Instruction::Sub:
403 case Instruction::Mul:
404 case Instruction::UDiv:
405 case Instruction::SDiv:
406 case Instruction::URem:
407 case Instruction::SRem:
408 case Instruction::And:
409 case Instruction::Or:
410 case Instruction::Xor:
411 case Instruction::ICmp:
412 ImmIdx = 1;
413 break;
414 // Always return TCC_Free for the shift value of a shift instruction.
415 case Instruction::Shl:
416 case Instruction::LShr:
417 case Instruction::AShr:
418 if (Idx == 1)
419 return TTI::TCC_Free;
420 break;
421 case Instruction::Trunc:
422 case Instruction::ZExt:
423 case Instruction::SExt:
424 case Instruction::IntToPtr:
425 case Instruction::PtrToInt:
426 case Instruction::BitCast:
427 case Instruction::PHI:
428 case Instruction::Call:
429 case Instruction::Select:
430 case Instruction::Ret:
431 case Instruction::Load:
432 break;
433 }
434
435 if (Idx == ImmIdx) {
436 int NumConstants = (BitSize + 63) / 64;
437 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
438 return (Cost <= NumConstants * TTI::TCC_Basic)
439 ? static_cast<int>(TTI::TCC_Free)
440 : Cost;
441 }
442 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
443}
444
445InstructionCost
446AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
447 const APInt &Imm, Type *Ty,
448 TTI::TargetCostKind CostKind) {
449 assert(Ty->isIntegerTy());
450
451 unsigned BitSize = Ty->getPrimitiveSizeInBits();
452 // There is no cost model for constants with a bit size of 0. Return TCC_Free
453 // here, so that constant hoisting will ignore this constant.
454 if (BitSize == 0)
455 return TTI::TCC_Free;
456
457 // Most (all?) AArch64 intrinsics do not support folding immediates into the
458 // selected instruction, so we compute the materialization cost for the
459 // immediate directly.
460 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
461 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
462
463 switch (IID) {
464 default:
465 return TTI::TCC_Free;
466 case Intrinsic::sadd_with_overflow:
467 case Intrinsic::uadd_with_overflow:
468 case Intrinsic::ssub_with_overflow:
469 case Intrinsic::usub_with_overflow:
470 case Intrinsic::smul_with_overflow:
471 case Intrinsic::umul_with_overflow:
472 if (Idx == 1) {
473 int NumConstants = (BitSize + 63) / 64;
474 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
475 return (Cost <= NumConstants * TTI::TCC_Basic)
476 ? static_cast<int>(TTI::TCC_Free)
477 : Cost;
478 }
479 break;
480 case Intrinsic::experimental_stackmap:
481 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
482 return TTI::TCC_Free;
483 break;
484 case Intrinsic::experimental_patchpoint_void:
485 case Intrinsic::experimental_patchpoint:
486 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
487 return TTI::TCC_Free;
488 break;
489 case Intrinsic::experimental_gc_statepoint:
490 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
491 return TTI::TCC_Free;
492 break;
493 }
494 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
495}
496
497TargetTransformInfo::PopcntSupportKind
498AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
499 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
500 if (TyWidth == 32 || TyWidth == 64)
501 return TTI::PSK_FastHardware;
502 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
503 return TTI::PSK_Software;
504}
505
506static bool isUnpackedVectorVT(EVT VecVT) {
507 return VecVT.isScalableVector() &&
508 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
509}
510
511InstructionCost
512AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
513 TTI::TargetCostKind CostKind) {
514 auto *RetTy = ICA.getReturnType();
515 switch (ICA.getID()) {
516 case Intrinsic::umin:
517 case Intrinsic::umax:
518 case Intrinsic::smin:
519 case Intrinsic::smax: {
520 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
521 MVT::v8i16, MVT::v2i32, MVT::v4i32,
522 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
523 MVT::nxv2i64};
524 auto LT = getTypeLegalizationCost(RetTy);
525 // v2i64 types get converted to cmp+bif hence the cost of 2
526 if (LT.second == MVT::v2i64)
527 return LT.first * 2;
528 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
529 return LT.first;
530 break;
531 }
532 case Intrinsic::sadd_sat:
533 case Intrinsic::ssub_sat:
534 case Intrinsic::uadd_sat:
535 case Intrinsic::usub_sat: {
536 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
537 MVT::v8i16, MVT::v2i32, MVT::v4i32,
538 MVT::v2i64};
539 auto LT = getTypeLegalizationCost(RetTy);
540 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
541 // need to extend the type, as it uses shr(qadd(shl, shl)).
542 unsigned Instrs =
543 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
544 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
545 return LT.first * Instrs;
546 break;
547 }
548 case Intrinsic::abs: {
549 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
550 MVT::v8i16, MVT::v2i32, MVT::v4i32,
551 MVT::v2i64};
552 auto LT = getTypeLegalizationCost(RetTy);
553 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
554 return LT.first;
555 break;
556 }
557 case Intrinsic::bswap: {
558 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
559 MVT::v4i32, MVT::v2i64};
560 auto LT = getTypeLegalizationCost(RetTy);
561 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
562 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
563 return LT.first;
564 break;
565 }
566 case Intrinsic::experimental_stepvector: {
567 InstructionCost Cost = 1; // Cost of the `index' instruction
568 auto LT = getTypeLegalizationCost(RetTy);
569 // Legalisation of illegal vectors involves an `index' instruction plus
570 // (LT.first - 1) vector adds.
571 if (LT.first > 1) {
572 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
573 InstructionCost AddCost =
574 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
575 Cost += AddCost * (LT.first - 1);
576 }
577 return Cost;
578 }
579 case Intrinsic::vector_extract:
580 case Intrinsic::vector_insert: {
581 // If both the vector and subvector types are legal types and the index
582 // is 0, then this should be a no-op or simple operation; return a
583 // relatively low cost.
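 // For example, extracting a legal <4 x i32> subvector from a legal
 // <vscale x 4 x i32> at index 0 is effectively a read of the low 128 bits
 // of the Z register, so it is treated as free below.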
584
585 // If arguments aren't actually supplied, then we cannot determine the
586 // value of the index. We also want to skip predicate types.
587 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
588 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
589 break;
590
591 LLVMContext &C = RetTy->getContext();
592 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
593 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
594 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
595 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
596 // Skip this if either the vector or subvector types are unpacked
597 // SVE types; they may get lowered to stack stores and loads.
598 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
599 break;
600
601 TargetLoweringBase::LegalizeKind SubVecLK =
602 getTLI()->getTypeConversion(C, SubVecVT);
603 TargetLoweringBase::LegalizeKind VecLK =
604 getTLI()->getTypeConversion(C, VecVT);
605 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
606 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
607 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
608 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
609 return TTI::TCC_Free;
610 break;
611 }
612 case Intrinsic::bitreverse: {
613 static const CostTblEntry BitreverseTbl[] = {
614 {Intrinsic::bitreverse, MVT::i32, 1},
615 {Intrinsic::bitreverse, MVT::i64, 1},
616 {Intrinsic::bitreverse, MVT::v8i8, 1},
617 {Intrinsic::bitreverse, MVT::v16i8, 1},
618 {Intrinsic::bitreverse, MVT::v4i16, 2},
619 {Intrinsic::bitreverse, MVT::v8i16, 2},
620 {Intrinsic::bitreverse, MVT::v2i32, 2},
621 {Intrinsic::bitreverse, MVT::v4i32, 2},
622 {Intrinsic::bitreverse, MVT::v1i64, 2},
623 {Intrinsic::bitreverse, MVT::v2i64, 2},
624 };
625 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
626 const auto *Entry =
627 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
628 if (Entry) {
629 // The cost model uses the legal type (i32) that i8 and i16 will be
630 // converted to, plus 1 so that we match the actual lowering cost.
631 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
632 TLI->getValueType(DL, RetTy, true) == MVT::i16)
633 return LegalisationCost.first * Entry->Cost + 1;
634
635 return LegalisationCost.first * Entry->Cost;
636 }
637 break;
638 }
639 case Intrinsic::ctpop: {
640 if (!ST->hasNEON()) {
641 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
642 return getTypeLegalizationCost(RetTy).first * 12;
643 }
644 static const CostTblEntry CtpopCostTbl[] = {
645 {ISD::CTPOP, MVT::v2i64, 4},
646 {ISD::CTPOP, MVT::v4i32, 3},
647 {ISD::CTPOP, MVT::v8i16, 2},
648 {ISD::CTPOP, MVT::v16i8, 1},
649 {ISD::CTPOP, MVT::i64, 4},
650 {ISD::CTPOP, MVT::v2i32, 3},
651 {ISD::CTPOP, MVT::v4i16, 2},
652 {ISD::CTPOP, MVT::v8i8, 1},
653 {ISD::CTPOP, MVT::i32, 5},
654 };
655 auto LT = getTypeLegalizationCost(RetTy);
656 MVT MTy = LT.second;
657 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
658 // Extra cost of +1 when illegal vector types are legalized by promoting
659 // the integer type.
660 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
661 RetTy->getScalarSizeInBits()
662 ? 1
663 : 0;
664 return LT.first * Entry->Cost + ExtraCost;
665 }
666 break;
667 }
668 case Intrinsic::sadd_with_overflow:
669 case Intrinsic::uadd_with_overflow:
670 case Intrinsic::ssub_with_overflow:
671 case Intrinsic::usub_with_overflow:
672 case Intrinsic::smul_with_overflow:
673 case Intrinsic::umul_with_overflow: {
674 static const CostTblEntry WithOverflowCostTbl[] = {
675 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
676 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
677 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
678 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
679 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
680 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
681 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
682 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
683 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
684 {Intrinsic::usub_with_overflow, MVT::i8, 3},
685 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
686 {Intrinsic::usub_with_overflow, MVT::i16, 3},
687 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
688 {Intrinsic::usub_with_overflow, MVT::i32, 1},
689 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
690 {Intrinsic::usub_with_overflow, MVT::i64, 1},
691 {Intrinsic::smul_with_overflow, MVT::i8, 5},
692 {Intrinsic::umul_with_overflow, MVT::i8, 4},
693 {Intrinsic::smul_with_overflow, MVT::i16, 5},
694 {Intrinsic::umul_with_overflow, MVT::i16, 4},
695 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
696 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
697 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
698 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
699 };
700 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
701 if (MTy.isSimple())
702 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
703 MTy.getSimpleVT()))
704 return Entry->Cost;
705 break;
706 }
707 case Intrinsic::fptosi_sat:
708 case Intrinsic::fptoui_sat: {
709 if (ICA.getArgTypes().empty())
710 break;
711 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
712 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
713 EVT MTy = TLI->getValueType(DL, RetTy);
714 // Check for the legal types, which are where the size of the input and the
715 // output are the same, or we are using cvt f64->i32 or f32->i64.
716 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
717 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
718 LT.second == MVT::v2f64) &&
719 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
720 (LT.second == MVT::f64 && MTy == MVT::i32) ||
721 (LT.second == MVT::f32 && MTy == MVT::i64)))
722 return LT.first;
723 // Similarly for fp16 sizes
724 if (ST->hasFullFP16() &&
725 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
726 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
727 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
728 return LT.first;
729
730 // Otherwise we use a legal convert followed by a min+max
731 if ((LT.second.getScalarType() == MVT::f32 ||
732 LT.second.getScalarType() == MVT::f64 ||
733 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
734 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
735 Type *LegalTy =
736 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
737 if (LT.second.isVector())
738 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
739 InstructionCost Cost = 1;
740 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
741 LegalTy, {LegalTy, LegalTy});
742 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
743 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
744 LegalTy, {LegalTy, LegalTy});
745 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
746 return LT.first * Cost;
747 }
748 break;
749 }
750 case Intrinsic::fshl:
751 case Intrinsic::fshr: {
752 if (ICA.getArgs().empty())
753 break;
754
755 // TODO: Add handling for fshl where third argument is not a constant.
756 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
757 if (!OpInfoZ.isConstant())
758 break;
759
760 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
761 if (OpInfoZ.isUniform()) {
762 // FIXME: The costs could be lower if the codegen is better.
763 static const CostTblEntry FshlTbl[] = {
764 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
765 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
766 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
767 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
768 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
769 // to avoid having to duplicate the costs.
770 const auto *Entry =
771 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
772 if (Entry)
773 return LegalisationCost.first * Entry->Cost;
774 }
775
776 auto TyL = getTypeLegalizationCost(RetTy);
777 if (!RetTy->isIntegerTy())
778 break;
779
780 // Estimate cost manually, as types like i8 and i16 will get promoted to
781 // i32 and CostTableLookup will ignore the extra conversion cost.
782 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
783 RetTy->getScalarSizeInBits() < 64) ||
784 (RetTy->getScalarSizeInBits() % 64 != 0);
785 unsigned ExtraCost = HigherCost ? 1 : 0;
786 if (RetTy->getScalarSizeInBits() == 32 ||
787 RetTy->getScalarSizeInBits() == 64)
788 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
789 // extr instruction.
790 else if (HigherCost)
791 ExtraCost = 1;
792 else
793 break;
794 return TyL.first + ExtraCost;
795 }
796 case Intrinsic::get_active_lane_mask: {
797 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
798 if (RetTy) {
799 EVT RetVT = getTLI()->getValueType(DL, RetTy);
800 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
801 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
802 !getTLI()->isTypeLegal(RetVT)) {
803 // We don't have enough context at this point to determine if the mask
804 // is going to be kept live after the block, which will force the vXi1
805 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
806 // For now, we just assume the vectorizer created this intrinsic and
807 // the result will be the input for a PHI. In this case the cost will
808 // be extremely high for fixed-width vectors.
809 // NOTE: getScalarizationOverhead returns a cost that's far too
810 // pessimistic for the actual generated codegen. In reality there are
811 // two instructions generated per lane.
812 return RetTy->getNumElements() * 2;
813 }
814 }
815 break;
816 }
817 default:
818 break;
819 }
820 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
821}
822
823/// This function removes redundant reinterpret (svbool conversion) casts
824/// in the presence of control flow.
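/// For example, when a convert.from.svbool takes a phi whose incoming values
/// are all convert.to.svbool calls on the required type, the phi is rebuilt
/// over the original predicates and the conversions become dead.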
825static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
826 IntrinsicInst &II) {
827 SmallVector<Instruction *, 32> Worklist;
828 auto RequiredType = II.getType();
829
830 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
831 assert(PN && "Expected Phi Node!");
832
833 // Don't create a new Phi unless we can remove the old one.
834 if (!PN->hasOneUse())
835 return std::nullopt;
836
837 for (Value *IncValPhi : PN->incoming_values()) {
838 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
839 if (!Reinterpret ||
840 Reinterpret->getIntrinsicID() !=
841 Intrinsic::aarch64_sve_convert_to_svbool ||
842 RequiredType != Reinterpret->getArgOperand(0)->getType())
843 return std::nullopt;
844 }
845
846 // Create the new Phi
847 IC.Builder.SetInsertPoint(PN);
848 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
849 Worklist.push_back(PN);
850
851 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
852 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
853 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
854 Worklist.push_back(Reinterpret);
855 }
856
857 // Cleanup Phi Node and reinterprets
858 return IC.replaceInstUsesWith(II, NPN);
859}
860
861// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
862// => (binop (pred) (from_svbool _) (from_svbool _))
863//
864// The above transformation eliminates a `to_svbool` in the predicate
865// operand of bitwise operation `binop` by narrowing the vector width of
866// the operation. For example, it would convert a `<vscale x 16 x i1>
867// and` into a `<vscale x 4 x i1> and`. This is profitable because
868// to_svbool must zero the new lanes during widening, whereas
869// from_svbool is free.
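// For example, roughly:
//   and.z(to_svbool(%pg : <vscale x 4 x i1>), %a, %b) feeding a from_svbool
// becomes
//   and.z(%pg, from_svbool(%a), from_svbool(%b)) on <vscale x 4 x i1>.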
870static std::optional<Instruction *>
871tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
872 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
873 if (!BinOp)
874 return std::nullopt;
875
876 auto IntrinsicID = BinOp->getIntrinsicID();
877 switch (IntrinsicID) {
878 case Intrinsic::aarch64_sve_and_z:
879 case Intrinsic::aarch64_sve_bic_z:
880 case Intrinsic::aarch64_sve_eor_z:
881 case Intrinsic::aarch64_sve_nand_z:
882 case Intrinsic::aarch64_sve_nor_z:
883 case Intrinsic::aarch64_sve_orn_z:
884 case Intrinsic::aarch64_sve_orr_z:
885 break;
886 default:
887 return std::nullopt;
888 }
889
890 auto BinOpPred = BinOp->getOperand(0);
891 auto BinOpOp1 = BinOp->getOperand(1);
892 auto BinOpOp2 = BinOp->getOperand(2);
893
894 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
895 if (!PredIntr ||
896 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
897 return std::nullopt;
898
899 auto PredOp = PredIntr->getOperand(0);
900 auto PredOpTy = cast<VectorType>(PredOp->getType());
901 if (PredOpTy != II.getType())
902 return std::nullopt;
903
904 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
905 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
906 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
907 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
908 if (BinOpOp1 == BinOpOp2)
909 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
910 else
911 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
912 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
913
914 auto NarrowedBinOp =
915 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
916 return IC.replaceInstUsesWith(II, NarrowedBinOp);
917}
918
919static std::optional<Instruction *>
920instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
921 // If the reinterpret instruction operand is a PHI Node
922 if (isa<PHINode>(II.getArgOperand(0)))
923 return processPhiNode(IC, II);
924
925 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
926 return BinOpCombine;
927
928 // Ignore converts to/from svcount_t.
929 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
930 isa<TargetExtType>(II.getType()))
931 return std::nullopt;
932
933 SmallVector<Instruction *, 32> CandidatesForRemoval;
934 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
935
936 const auto *IVTy = cast<VectorType>(II.getType());
937
938 // Walk the chain of conversions.
939 while (Cursor) {
940 // If the type of the cursor has fewer lanes than the final result, zeroing
941 // must take place, which breaks the equivalence chain.
942 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
943 if (CursorVTy->getElementCount().getKnownMinValue() <
944 IVTy->getElementCount().getKnownMinValue())
945 break;
946
947 // If the cursor has the same type as I, it is a viable replacement.
948 if (Cursor->getType() == IVTy)
949 EarliestReplacement = Cursor;
950
951 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
952
953 // If this is not an SVE conversion intrinsic, this is the end of the chain.
954 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
955 Intrinsic::aarch64_sve_convert_to_svbool ||
956 IntrinsicCursor->getIntrinsicID() ==
957 Intrinsic::aarch64_sve_convert_from_svbool))
958 break;
959
960 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
961 Cursor = IntrinsicCursor->getOperand(0);
962 }
963
964 // If no viable replacement in the conversion chain was found, there is
965 // nothing to do.
966 if (!EarliestReplacement)
967 return std::nullopt;
968
969 return IC.replaceInstUsesWith(II, EarliestReplacement);
970}
971
972static bool isAllActivePredicate(Value *Pred) {
974 // Look through the convert.from.svbool(convert.to.svbool(...)) chain.
974 Value *UncastedPred;
975 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
976 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
977 m_Value(UncastedPred)))))
978 // If the predicate has the same or fewer lanes than the uncasted
979 // predicate then we know the casting has no effect.
980 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
981 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
982 Pred = UncastedPred;
983
984 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
985 m_ConstantInt<AArch64SVEPredPattern::all>()));
986}
987
988// Simplify unary operation where predicate has all inactive lanes by replacing
989// instruction with zeroed object
990static std::optional<Instruction *>
991instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
992 if (match(II.getOperand(0), m_ZeroInt())) {
993 Constant *Node;
994 Type *RetTy = II.getType();
995 if (RetTy->isStructTy()) {
996 auto StructT = cast<StructType>(RetTy);
997 auto VecT = StructT->getElementType(0);
998 SmallVector<Constant *> ZerVec;
999 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1000 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1001 : ConstantInt::get(VecT, 0));
1002 }
1003 Node = ConstantStruct::get(StructT, ZerVec);
1004 } else if (RetTy->isFPOrFPVectorTy())
1005 Node = ConstantFP::get(RetTy, 0.0);
1006 else
1007 Node = ConstantInt::get(II.getType(), 0);
1008
1009 IC.replaceInstUsesWith(II, Node);
1010 return IC.eraseInstFromFunction(II);
1011 }
1012 return std::nullopt;
1013}
1014
1015static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1016 IntrinsicInst &II) {
1017 // svsel(ptrue, x, y) => x
1018 auto *OpPredicate = II.getOperand(0);
1019 if (isAllActivePredicate(OpPredicate))
1020 return IC.replaceInstUsesWith(II, II.getOperand(1));
1021
1022 auto Select =
1023 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1024 return IC.replaceInstUsesWith(II, Select);
1025}
1026
1027static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1028 IntrinsicInst &II) {
1029 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1030 if (!Pg)
1031 return std::nullopt;
1032
1033 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1034 return std::nullopt;
1035
1036 const auto PTruePattern =
1037 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1038 if (PTruePattern != AArch64SVEPredPattern::vl1)
1039 return std::nullopt;
1040
1041 // The intrinsic is inserting into lane zero so use an insert instead.
1042 auto *IdxTy = Type::getInt64Ty(II.getContext());
1043 auto *Insert = InsertElementInst::Create(
1044 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1045 Insert->insertBefore(&II);
1046 Insert->takeName(&II);
1047
1048 return IC.replaceInstUsesWith(II, Insert);
1049}
1050
1051static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1052 IntrinsicInst &II) {
1053 // Replace DupX with a regular IR splat.
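 // For example, aarch64.sve.dup.x(i32 %x) returning <vscale x 4 x i32>
 // becomes the generic insertelement + shufflevector splat of %x.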
1054 auto *RetTy = cast<ScalableVectorType>(II.getType());
1055 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1056 II.getArgOperand(0));
1057 Splat->takeName(&II);
1058 return IC.replaceInstUsesWith(II, Splat);
1059}
1060
1061static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1062 IntrinsicInst &II) {
1063 LLVMContext &Ctx = II.getContext();
1064
1065 // Check that the predicate is all active
1066 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1067 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1068 return std::nullopt;
1069
1070 const auto PTruePattern =
1071 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1072 if (PTruePattern != AArch64SVEPredPattern::all)
1073 return std::nullopt;
1074
1075 // Check that we have a compare of zero..
1076 auto *SplatValue =
1077 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1078 if (!SplatValue || !SplatValue->isZero())
1079 return std::nullopt;
1080
1081 // ..against a dupq
1082 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1083 if (!DupQLane ||
1084 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1085 return std::nullopt;
1086
1087 // Where the dupq is a lane 0 replicate of a vector insert
1088 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1089 return std::nullopt;
1090
1091 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1092 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1093 return std::nullopt;
1094
1095 // Where the vector insert is a fixed constant vector insert into undef at
1096 // index zero
1097 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1098 return std::nullopt;
1099
1100 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1101 return std::nullopt;
1102
1103 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1104 if (!ConstVec)
1105 return std::nullopt;
1106
1107 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1108 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1109 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1110 return std::nullopt;
1111
1112 unsigned NumElts = VecTy->getNumElements();
1113 unsigned PredicateBits = 0;
1114
1115 // Expand intrinsic operands to a 16-bit byte level predicate
1116 for (unsigned I = 0; I < NumElts; ++I) {
1117 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1118 if (!Arg)
1119 return std::nullopt;
1120 if (!Arg->isZero())
1121 PredicateBits |= 1 << (I * (16 / NumElts));
1122 }
1123
1124 // If all bits are zero bail early with an empty predicate
1125 if (PredicateBits == 0) {
1126 auto *PFalse = Constant::getNullValue(II.getType());
1127 PFalse->takeName(&II);
1128 return IC.replaceInstUsesWith(II, PFalse);
1129 }
1130
1131 // Calculate largest predicate type used (where byte predicate is largest)
1132 unsigned Mask = 8;
1133 for (unsigned I = 0; I < 16; ++I)
1134 if ((PredicateBits & (1 << I)) != 0)
1135 Mask |= (I % 8);
1136
1137 unsigned PredSize = Mask & -Mask;
1138 auto *PredType = ScalableVectorType::get(
1139 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1140
1141 // Ensure all relevant bits are set
1142 for (unsigned I = 0; I < 16; I += PredSize)
1143 if ((PredicateBits & (1 << I)) == 0)
1144 return std::nullopt;
1145
1146 auto *PTruePat =
1147 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1148 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1149 {PredType}, {PTruePat});
1150 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1151 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1152 auto *ConvertFromSVBool =
1153 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1154 {II.getType()}, {ConvertToSVBool});
1155
1156 ConvertFromSVBool->takeName(&II);
1157 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1158}
1159
1160static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1161 IntrinsicInst &II) {
1162 Value *Pg = II.getArgOperand(0);
1163 Value *Vec = II.getArgOperand(1);
1164 auto IntrinsicID = II.getIntrinsicID();
1165 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1166
1167 // lastX(splat(X)) --> X
1168 if (auto *SplatVal = getSplatValue(Vec))
1169 return IC.replaceInstUsesWith(II, SplatVal);
1170
1171 // If x and/or y is a splat value then:
1172 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1173 Value *LHS, *RHS;
1174 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1175 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1176 auto *OldBinOp = cast<BinaryOperator>(Vec);
1177 auto OpC = OldBinOp->getOpcode();
1178 auto *NewLHS =
1179 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1180 auto *NewRHS =
1181 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1182 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1183 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1184 return IC.replaceInstUsesWith(II, NewBinOp);
1185 }
1186 }
1187
1188 auto *C = dyn_cast<Constant>(Pg);
1189 if (IsAfter && C && C->isNullValue()) {
1190 // The intrinsic is extracting lane 0 so use an extract instead.
1191 auto *IdxTy = Type::getInt64Ty(II.getContext());
1192 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1193 Extract->insertBefore(&II);
1194 Extract->takeName(&II);
1195 return IC.replaceInstUsesWith(II, Extract);
1196 }
1197
1198 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1199 if (!IntrPG)
1200 return std::nullopt;
1201
1202 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1203 return std::nullopt;
1204
1205 const auto PTruePattern =
1206 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1207
1208 // Can the intrinsic's predicate be converted to a known constant index?
1209 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1210 if (!MinNumElts)
1211 return std::nullopt;
1212
1213 unsigned Idx = MinNumElts - 1;
1214 // Increment the index if extracting the element after the last active
1215 // predicate element.
1216 if (IsAfter)
1217 ++Idx;
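 // For example, with a ptrue of pattern vl8, lastb reads lane 7 and lasta
 // reads lane 8 (provided that lane exists, which is checked below).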
1218
1219 // Ignore extracts whose index is larger than the known minimum vector
1220 // length. NOTE: This is an artificial constraint where we prefer to
1221 // maintain what the user asked for until an alternative is proven faster.
1222 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1223 if (Idx >= PgVTy->getMinNumElements())
1224 return std::nullopt;
1225
1226 // The intrinsic is extracting a fixed lane so use an extract instead.
1227 auto *IdxTy = Type::getInt64Ty(II.getContext());
1228 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1229 Extract->insertBefore(&II);
1230 Extract->takeName(&II);
1231 return IC.replaceInstUsesWith(II, Extract);
1232}
1233
1234static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1235 IntrinsicInst &II) {
1236 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1237 // integer variant across a variety of micro-architectures. Replace scalar
1238 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1239 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1240 // depending on the micro-architecture, but has been observed as generally
1241 // being faster, particularly when the CLAST[AB] op is a loop-carried
1242 // dependency.
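 // For example, an i64 clastb becomes: bitcast the fallback to double and
 // the vector to <vscale x 2 x double>, run the FP clastb, then bitcast the
 // result back to i64.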
1243 Value *Pg = II.getArgOperand(0);
1244 Value *Fallback = II.getArgOperand(1);
1245 Value *Vec = II.getArgOperand(2);
1246 Type *Ty = II.getType();
1247
1248 if (!Ty->isIntegerTy())
1249 return std::nullopt;
1250
1251 Type *FPTy;
1252 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1253 default:
1254 return std::nullopt;
1255 case 16:
1256 FPTy = IC.Builder.getHalfTy();
1257 break;
1258 case 32:
1259 FPTy = IC.Builder.getFloatTy();
1260 break;
1261 case 64:
1262 FPTy = IC.Builder.getDoubleTy();
1263 break;
1264 }
1265
1266 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1267 auto *FPVTy = VectorType::get(
1268 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1269 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1270 auto *FPII = IC.Builder.CreateIntrinsic(
1271 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1272 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1273 return IC.replaceInstUsesWith(II, FPIItoInt);
1274}
1275
1276static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1277 IntrinsicInst &II) {
1278 LLVMContext &Ctx = II.getContext();
1279 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1280 // can work with RDFFR_PP for ptest elimination.
1281 auto *AllPat =
1282 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1283 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1284 {II.getType()}, {AllPat});
1285 auto *RDFFR =
1286 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1287 RDFFR->takeName(&II);
1288 return IC.replaceInstUsesWith(II, RDFFR);
1289}
1290
1291static std::optional<Instruction *>
1292instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1293 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1294
1295 if (Pattern == AArch64SVEPredPattern::all) {
1296 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1297 auto *VScale = IC.Builder.CreateVScale(StepVal);
1298 VScale->takeName(&II);
1299 return IC.replaceInstUsesWith(II, VScale);
1300 }
1301
1302 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1303
1304 return MinNumElts && NumElts >= MinNumElts
1305 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1306 II, ConstantInt::get(II.getType(), MinNumElts)))
1307 : std::nullopt;
1308}
1309
1310static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1311 IntrinsicInst &II) {
1312 Value *PgVal = II.getArgOperand(0);
1313 Value *OpVal = II.getArgOperand(1);
1314
1315 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1316 // Later optimizations prefer this form.
1317 if (PgVal == OpVal &&
1318 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1319 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1320 Value *Ops[] = {PgVal, OpVal};
1321 Type *Tys[] = {PgVal->getType()};
1322
1323 auto *PTest =
1324 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1325 PTest->takeName(&II);
1326
1327 return IC.replaceInstUsesWith(II, PTest);
1328 }
1329
1330 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1331 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1332
1333 if (!Pg || !Op)
1334 return std::nullopt;
1335
1336 Intrinsic::ID OpIID = Op->getIntrinsicID();
1337
1338 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1339 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1340 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1341 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1342 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1343
1344 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1345
1346 PTest->takeName(&II);
1347 return IC.replaceInstUsesWith(II, PTest);
1348 }
1349
1350 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1351 // Later optimizations may rewrite sequence to use the flag-setting variant
1352 // of instruction X to remove PTEST.
1353 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1354 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1355 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1356 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1357 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1358 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1359 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1360 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1361 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1362 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1363 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1364 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1365 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1366 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1367 Type *Tys[] = {Pg->getType()};
1368
1369 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1370 PTest->takeName(&II);
1371
1372 return IC.replaceInstUsesWith(II, PTest);
1373 }
1374
1375 return std::nullopt;
1376}
1377
1378template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1379static std::optional<Instruction *>
1380instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1381 bool MergeIntoAddendOp) {
1382 Value *P = II.getOperand(0);
1383 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1384 if (MergeIntoAddendOp) {
1385 AddendOp = II.getOperand(1);
1386 Mul = II.getOperand(2);
1387 } else {
1388 AddendOp = II.getOperand(2);
1389 Mul = II.getOperand(1);
1390 }
1391
1392 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1393 m_Value(MulOp1))))
1394 return std::nullopt;
1395
1396 if (!Mul->hasOneUse())
1397 return std::nullopt;
1398
1399 Instruction *FMFSource = nullptr;
1400 if (II.getType()->isFPOrFPVectorTy()) {
1401 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1402 // Stop the combine when the flags on the inputs differ in case dropping
1403 // flags would lead to us missing out on more beneficial optimizations.
1404 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1405 return std::nullopt;
1406 if (!FAddFlags.allowContract())
1407 return std::nullopt;
1408 FMFSource = &II;
1409 }
1410
1411 CallInst *Res;
1412 if (MergeIntoAddendOp)
1413 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1414 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1415 else
1416 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1417 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1418
1419 return IC.replaceInstUsesWith(II, Res);
1420}
1421
1422static std::optional<Instruction *>
1423instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1424 Value *Pred = II.getOperand(0);
1425 Value *PtrOp = II.getOperand(1);
1426 Type *VecTy = II.getType();
1427
1428 // Replace by zero constant when all lanes are inactive
1429 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1430 return II_NA;
1431
1432 if (isAllActivePredicate(Pred)) {
1433 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1434 Load->copyMetadata(II);
1435 return IC.replaceInstUsesWith(II, Load);
1436 }
1437
1438 CallInst *MaskedLoad =
1439 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1440 Pred, ConstantAggregateZero::get(VecTy));
1441 MaskedLoad->copyMetadata(II);
1442 return IC.replaceInstUsesWith(II, MaskedLoad);
1443}
1444
1445static std::optional<Instruction *>
1446instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1447 Value *VecOp = II.getOperand(0);
1448 Value *Pred = II.getOperand(1);
1449 Value *PtrOp = II.getOperand(2);
1450
1451 if (isAllActivePredicate(Pred)) {
1452 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1453 Store->copyMetadata(II);
1454 return IC.eraseInstFromFunction(II);
1455 }
1456
1457 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1458 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1459 MaskedStore->copyMetadata(II);
1460 return IC.eraseInstFromFunction(II);
1461}
1462
1463static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1464 switch (Intrinsic) {
1465 case Intrinsic::aarch64_sve_fmul_u:
1466 return Instruction::BinaryOps::FMul;
1467 case Intrinsic::aarch64_sve_fadd_u:
1468 return Instruction::BinaryOps::FAdd;
1469 case Intrinsic::aarch64_sve_fsub_u:
1470 return Instruction::BinaryOps::FSub;
1471 default:
1472 return Instruction::BinaryOpsEnd;
1473 }
1474}
1475
1476static std::optional<Instruction *>
1477instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1478 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1479 if (II.isStrictFP())
1480 return std::nullopt;
1481
1482 auto *OpPredicate = II.getOperand(0);
1483 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1484 if (BinOpCode == Instruction::BinaryOpsEnd ||
1485 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1486 m_ConstantInt<AArch64SVEPredPattern::all>())))
1487 return std::nullopt;
1488 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1489 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1490 auto BinOp =
1491 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1492 return IC.replaceInstUsesWith(II, BinOp);
1493}
1494
1495// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1496// sve.add_u).
1497static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1498 Intrinsic::ID IID) {
1499 auto *OpPredicate = II.getOperand(0);
1500 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1501 m_ConstantInt<AArch64SVEPredPattern::all>())))
1502 return std::nullopt;
1503
1504 auto *Mod = II.getModule();
1505 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1506 II.setCalledFunction(NewDecl);
1507
1508 return &II;
1509}
1510
1511// Simplify operations where predicate has all inactive lanes or try to replace
1512// with _u form when all lanes are active
1513static std::optional<Instruction *>
1514instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1515 Intrinsic::ID IID) {
1516 if (match(II.getOperand(0), m_ZeroInt())) {
1517 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1518 // inactive for sv[func]_m
1519 return IC.replaceInstUsesWith(II, II.getOperand(1));
1520 }
1521 return instCombineSVEAllActive(II, IID);
1522}
1523
1524static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1525 IntrinsicInst &II) {
1526 if (auto II_U =
1527 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1528 return II_U;
1529 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1530 Intrinsic::aarch64_sve_mla>(
1531 IC, II, true))
1532 return MLA;
1533 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1534 Intrinsic::aarch64_sve_mad>(
1535 IC, II, false))
1536 return MAD;
1537 return std::nullopt;
1538}
1539
1540static std::optional<Instruction *>
1541instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1542 if (auto II_U =
1543 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1544 return II_U;
1545 if (auto FMLA =
1546 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1547 Intrinsic::aarch64_sve_fmla>(IC, II,
1548 true))
1549 return FMLA;
1550 if (auto FMAD =
1551 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1552 Intrinsic::aarch64_sve_fmad>(IC, II,
1553 false))
1554 return FMAD;
1555 if (auto FMLA =
1556 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1557 Intrinsic::aarch64_sve_fmla>(IC, II,
1558 true))
1559 return FMLA;
1560 return std::nullopt;
1561}
1562
1563static std::optional<Instruction *>
1564instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1565 if (auto FMLA =
1566 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1567 Intrinsic::aarch64_sve_fmla>(IC, II,
1568 true))
1569 return FMLA;
1570 if (auto FMAD =
1571 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1572 Intrinsic::aarch64_sve_fmad>(IC, II,
1573 false))
1574 return FMAD;
1575 if (auto FMLA_U =
1576 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1577 Intrinsic::aarch64_sve_fmla_u>(
1578 IC, II, true))
1579 return FMLA_U;
1580 return instCombineSVEVectorBinOp(IC, II);
1581}
1582
1583static std::optional<Instruction *>
1584instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1585 if (auto II_U =
1586 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1587 return II_U;
1588 if (auto FMLS =
1589 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1590 Intrinsic::aarch64_sve_fmls>(IC, II,
1591 true))
1592 return FMLS;
1593 if (auto FMSB =
1594 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1595 Intrinsic::aarch64_sve_fnmsb>(
1596 IC, II, false))
1597 return FMSB;
1598 if (auto FMLS =
1599 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1600 Intrinsic::aarch64_sve_fmls>(IC, II,
1601 true))
1602 return FMLS;
1603 return std::nullopt;
1604}
1605
1606static std::optional<Instruction *>
1607instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1608 if (auto FMLS =
1609 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1610 Intrinsic::aarch64_sve_fmls>(IC, II,
1611 true))
1612 return FMLS;
1613 if (auto FMSB =
1614 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1615 Intrinsic::aarch64_sve_fnmsb>(
1616 IC, II, false))
1617 return FMSB;
1618 if (auto FMLS_U =
1619 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1620 Intrinsic::aarch64_sve_fmls_u>(
1621 IC, II, true))
1622 return FMLS_U;
1623 return instCombineSVEVectorBinOp(IC, II);
1624}
1625
1626static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1627 IntrinsicInst &II) {
1628 if (auto II_U =
1629 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1630 return II_U;
1631 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1632 Intrinsic::aarch64_sve_mls>(
1633 IC, II, true))
1634 return MLS;
1635 return std::nullopt;
1636}
1637
1638static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1639 IntrinsicInst &II,
1640 Intrinsic::ID IID) {
1641 auto *OpPredicate = II.getOperand(0);
1642 auto *OpMultiplicand = II.getOperand(1);
1643 auto *OpMultiplier = II.getOperand(2);
1644
1645 // Return true if the given value is a unit (1 or 1.0) splat, false otherwise.
1646 auto IsUnitSplat = [](auto *I) {
1647 auto *SplatValue = getSplatValue(I);
1648 if (!SplatValue)
1649 return false;
1650 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1651 };
1652
1653 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1654 // with a unit splat value, false otherwise.
1655 auto IsUnitDup = [](auto *I) {
1656 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1657 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1658 return false;
1659
1660 auto *SplatValue = IntrI->getOperand(2);
1661 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1662 };
1663
1664 if (IsUnitSplat(OpMultiplier)) {
1665 // [f]mul pg %n, (dupx 1) => %n
1666 OpMultiplicand->takeName(&II);
1667 return IC.replaceInstUsesWith(II, OpMultiplicand);
1668 } else if (IsUnitDup(OpMultiplier)) {
1669 // [f]mul pg %n, (dup pg 1) => %n
1670 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1671 auto *DupPg = DupInst->getOperand(1);
1672 // TODO: this is naive. The optimization is still valid if DupPg
1673 // 'encompasses' OpPredicate, not only if they're the same predicate.
1674 if (OpPredicate == DupPg) {
1675 OpMultiplicand->takeName(&II);
1676 return IC.replaceInstUsesWith(II, OpMultiplicand);
1677 }
1678 }
1679
1680 return instCombineSVEVectorBinOp(IC, II);
1681}
1682
1683static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1684 IntrinsicInst &II) {
1685 Value *UnpackArg = II.getArgOperand(0);
1686 auto *RetTy = cast<ScalableVectorType>(II.getType());
1687 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1688 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1689
1690 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1691 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1692 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1693 ScalarArg =
1694 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1695 Value *NewVal =
1696 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1697 NewVal->takeName(&II);
1698 return IC.replaceInstUsesWith(II, NewVal);
1699 }
1700
1701 return std::nullopt;
1702}
1703static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1704 IntrinsicInst &II) {
1705 auto *OpVal = II.getOperand(0);
1706 auto *OpIndices = II.getOperand(1);
1707 VectorType *VTy = cast<VectorType>(II.getType());
1708
1709 // Check whether OpIndices is a constant splat value < minimal element count
1710 // of result.
1711 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1712 if (!SplatValue ||
1713 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1714 return std::nullopt;
1715
1716 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
1717 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1718 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1719 auto *VectorSplat =
1720 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1721
1722 VectorSplat->takeName(&II);
1723 return IC.replaceInstUsesWith(II, VectorSplat);
1724}
1725
1726static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1727 IntrinsicInst &II) {
1728 Value *A, *B;
1729 Type *RetTy = II.getType();
1730 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1731 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1732
1733 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1734 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1735 if ((match(II.getArgOperand(0),
1736 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1737 match(II.getArgOperand(1),
1738 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1739 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1740 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1741 auto *TyA = cast<ScalableVectorType>(A->getType());
1742 if (TyA == B->getType() &&
1743 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1744 auto *SubVec = IC.Builder.CreateInsertVector(
1745 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1746 auto *ConcatVec = IC.Builder.CreateInsertVector(
1747 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1748 ConcatVec->takeName(&II);
1749 return IC.replaceInstUsesWith(II, ConcatVec);
1750 }
1751 }
1752
1753 return std::nullopt;
1754}
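// For illustration: with A and B both <vscale x 8 x i1>, the uzp1 of their
// svbool conversions is just the concatenation of the two predicates, so it is
// rebuilt as a <vscale x 16 x i1> with A inserted at index 0 and B at index 8.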
1755
1756static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1757 IntrinsicInst &II) {
1758 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1759 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1760 Value *A, *B;
1761 if (match(II.getArgOperand(0),
1762 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1763 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1764 m_Specific(A), m_Specific(B))))
1765 return IC.replaceInstUsesWith(
1766 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1767
1768 return std::nullopt;
1769}
1770
1771static std::optional<Instruction *>
1772 instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1773 Value *Mask = II.getOperand(0);
1774 Value *BasePtr = II.getOperand(1);
1775 Value *Index = II.getOperand(2);
1776 Type *Ty = II.getType();
1777 Value *PassThru = ConstantAggregateZero::get(Ty);
1778
1779 // Replace by zero constant when all lanes are inactive
1780 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1781 return II_NA;
1782
1783 // Contiguous gather => masked load.
1784 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1785 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1786 Value *IndexBase;
1787 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1788 m_Value(IndexBase), m_SpecificInt(1)))) {
1789 Align Alignment =
1790 BasePtr->getPointerAlignment(II.getDataLayout());
1791
1792 Type *VecPtrTy = PointerType::getUnqual(Ty);
1793 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1794 BasePtr, IndexBase);
1795 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1796 CallInst *MaskedLoad =
1797 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1798 MaskedLoad->takeName(&II);
1799 return IC.replaceInstUsesWith(II, MaskedLoad);
1800 }
1801
1802 return std::nullopt;
1803}
1804
1805static std::optional<Instruction *>
1806 instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1807 Value *Val = II.getOperand(0);
1808 Value *Mask = II.getOperand(1);
1809 Value *BasePtr = II.getOperand(2);
1810 Value *Index = II.getOperand(3);
1811 Type *Ty = Val->getType();
1812
1813 // Contiguous scatter => masked store.
1814 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1815 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1816 Value *IndexBase;
1817 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1818 m_Value(IndexBase), m_SpecificInt(1)))) {
1819 Align Alignment =
1820 BasePtr->getPointerAlignment(II.getDataLayout());
1821
1822 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1823 BasePtr, IndexBase);
1824 Type *VecPtrTy = PointerType::getUnqual(Ty);
1825 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1826
1827 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1828
1829 return IC.eraseInstFromFunction(II);
1830 }
1831
1832 return std::nullopt;
1833}
1834
1835static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1836 IntrinsicInst &II) {
1837 Type *Int32Ty = IC.Builder.getInt32Ty();
1838 Value *Pred = II.getOperand(0);
1839 Value *Vec = II.getOperand(1);
1840 Value *DivVec = II.getOperand(2);
1841
1842 Value *SplatValue = getSplatValue(DivVec);
1843 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1844 if (!SplatConstantInt)
1845 return std::nullopt;
1846 APInt Divisor = SplatConstantInt->getValue();
1847
1848 if (Divisor.isPowerOf2()) {
1849 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1850 auto ASRD = IC.Builder.CreateIntrinsic(
1851 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1852 return IC.replaceInstUsesWith(II, ASRD);
1853 }
1854 if (Divisor.isNegatedPowerOf2()) {
1855 Divisor.negate();
1856 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1857 auto ASRD = IC.Builder.CreateIntrinsic(
1858 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1859 auto NEG = IC.Builder.CreateIntrinsic(
1860 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1861 return IC.replaceInstUsesWith(II, NEG);
1862 }
1863
1864 return std::nullopt;
1865}
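// For illustration: sdiv(pg, x, splat(8)) becomes asrd(pg, x, 3), and
// sdiv(pg, x, splat(-8)) becomes asrd(pg, x, 3) followed by a predicated
// negate, since ASRD is a rounding arithmetic shift right that implements
// signed division by a power of two.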
1866
1867bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1868 size_t VecSize = Vec.size();
1869 if (VecSize == 1)
1870 return true;
1871 if (!isPowerOf2_64(VecSize))
1872 return false;
1873 size_t HalfVecSize = VecSize / 2;
1874
1875 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1876 RHS != Vec.end(); LHS++, RHS++) {
1877 if (*LHS != nullptr && *RHS != nullptr) {
1878 if (*LHS == *RHS)
1879 continue;
1880 else
1881 return false;
1882 }
1883 if (!AllowPoison)
1884 return false;
1885 if (*LHS == nullptr && *RHS != nullptr)
1886 *LHS = *RHS;
1887 }
1888
1889 Vec.resize(HalfVecSize);
1890 SimplifyValuePattern(Vec, AllowPoison);
1891 return true;
1892}
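// For illustration: {a, b, a, b} simplifies to {a, b}; with AllowPoison set,
// {a, nullptr, a, b} (nullptr standing in for a poison lane) also simplifies
// to {a, b}, because a missing lane may adopt the value of its counterpart.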
1893
1894// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1895// to dupqlane(f64(C)) where C is A concatenated with B
1896static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1897 IntrinsicInst &II) {
1898 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1899 if (!match(II.getOperand(0),
1900 m_Intrinsic<Intrinsic::vector_insert>(
1901 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1902 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1903 return std::nullopt;
1904 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1905
1906 // Insert the scalars into a container ordered by InsertElement index
1907 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1908 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1909 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1910 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1911 CurrentInsertElt = InsertElt->getOperand(0);
1912 }
1913
1914 bool AllowPoison =
1915 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1916 if (!SimplifyValuePattern(Elts, AllowPoison))
1917 return std::nullopt;
1918
1919 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1920 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1921 for (size_t I = 0; I < Elts.size(); I++) {
1922 if (Elts[I] == nullptr)
1923 continue;
1924 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
1925 IC.Builder.getInt64(I));
1926 }
1927 if (InsertEltChain == nullptr)
1928 return std::nullopt;
1929
1930 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1931 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1932 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1933 // be narrowed back to the original type.
1934 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1935 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1936 IIScalableTy->getMinNumElements() /
1937 PatternWidth;
1938
1939 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1940 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1941 auto *WideShuffleMaskTy =
1942 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1943
1944 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
1945 auto InsertSubvector = IC.Builder.CreateInsertVector(
1946 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1947 auto WideBitcast =
1948 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1949 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1950 auto WideShuffle = IC.Builder.CreateShuffleVector(
1951 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1952 auto NarrowBitcast =
1953 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1954
1955 return IC.replaceInstUsesWith(II, NarrowBitcast);
1956}
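// For illustration: a <vscale x 8 x half> dupq_lane whose insert chain
// simplifies to the two scalars (a, b) has PatternWidth = 16 * 2 = 32 bits and
// PatternElementCount = 16 * 8 / 32 = 4, so the pair is splatted via a
// <vscale x 4 x i32> shuffle and bitcast back to <vscale x 8 x half>.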
1957
1958static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1959 IntrinsicInst &II) {
1960 Value *A = II.getArgOperand(0);
1961 Value *B = II.getArgOperand(1);
1962 if (A == B)
1963 return IC.replaceInstUsesWith(II, A);
1964
1965 return std::nullopt;
1966}
1967
1968static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1969 IntrinsicInst &II) {
1970 Value *Pred = II.getOperand(0);
1971 Value *Vec = II.getOperand(1);
1972 Value *Shift = II.getOperand(2);
1973
1974 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1975 Value *AbsPred, *MergedValue;
1976 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1977 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1978 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1979 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1980
1981 return std::nullopt;
1982
1983 // Transform is valid if any of the following are true:
1984 // * The ABS merge value is an undef or non-negative
1985 // * The ABS predicate is all active
1986 // * The ABS predicate and the SRSHL predicates are the same
1987 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1988 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1989 return std::nullopt;
1990
1991 // Only valid when the shift amount is non-negative, otherwise the rounding
1992 // behaviour of SRSHL cannot be ignored.
1993 if (!match(Shift, m_NonNegative()))
1994 return std::nullopt;
1995
1996 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1997 {II.getType()}, {Pred, Vec, Shift});
1998
1999 return IC.replaceInstUsesWith(II, LSL);
2000}
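// For illustration: srshl(pg, abs(pg, x), splat(2)) becomes
// lsl(pg, abs(pg, x), splat(2)): the ABS result is known non-negative and the
// shift amount is non-negative, so the rounding behaviour of SRSHL never comes
// into play and a plain logical shift left is equivalent.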
2001
2002std::optional<Instruction *>
2003 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2004 IntrinsicInst &II) const {
2005 Intrinsic::ID IID = II.getIntrinsicID();
2006 switch (IID) {
2007 default:
2008 break;
2009
2010 case Intrinsic::aarch64_sve_ld1_gather:
2011 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2012 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2013 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2014 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2015 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2016 case Intrinsic::aarch64_sve_ld1q_gather_index:
2017 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2018 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2019 case Intrinsic::aarch64_sve_ld1ro:
2020 case Intrinsic::aarch64_sve_ld1rq:
2021 case Intrinsic::aarch64_sve_ld1udq:
2022 case Intrinsic::aarch64_sve_ld1uwq:
2023 case Intrinsic::aarch64_sve_ld2_sret:
2024 case Intrinsic::aarch64_sve_ld2q_sret:
2025 case Intrinsic::aarch64_sve_ld3_sret:
2026 case Intrinsic::aarch64_sve_ld3q_sret:
2027 case Intrinsic::aarch64_sve_ld4_sret:
2028 case Intrinsic::aarch64_sve_ld4q_sret:
2029 case Intrinsic::aarch64_sve_ldff1:
2030 case Intrinsic::aarch64_sve_ldff1_gather:
2031 case Intrinsic::aarch64_sve_ldff1_gather_index:
2032 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2033 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2034 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2035 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2036 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2037 case Intrinsic::aarch64_sve_ldnf1:
2038 case Intrinsic::aarch64_sve_ldnt1:
2039 case Intrinsic::aarch64_sve_ldnt1_gather:
2040 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2041 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2042 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2043 return instCombineSVENoActiveUnaryZero(IC, II);
2044 case Intrinsic::aarch64_neon_fmaxnm:
2045 case Intrinsic::aarch64_neon_fminnm:
2046 return instCombineMaxMinNM(IC, II);
2047 case Intrinsic::aarch64_sve_convert_from_svbool:
2048 return instCombineConvertFromSVBool(IC, II);
2049 case Intrinsic::aarch64_sve_dup:
2050 return instCombineSVEDup(IC, II);
2051 case Intrinsic::aarch64_sve_dup_x:
2052 return instCombineSVEDupX(IC, II);
2053 case Intrinsic::aarch64_sve_cmpne:
2054 case Intrinsic::aarch64_sve_cmpne_wide:
2055 return instCombineSVECmpNE(IC, II);
2056 case Intrinsic::aarch64_sve_rdffr:
2057 return instCombineRDFFR(IC, II);
2058 case Intrinsic::aarch64_sve_lasta:
2059 case Intrinsic::aarch64_sve_lastb:
2060 return instCombineSVELast(IC, II);
2061 case Intrinsic::aarch64_sve_clasta_n:
2062 case Intrinsic::aarch64_sve_clastb_n:
2063 return instCombineSVECondLast(IC, II);
2064 case Intrinsic::aarch64_sve_cntd:
2065 return instCombineSVECntElts(IC, II, 2);
2066 case Intrinsic::aarch64_sve_cntw:
2067 return instCombineSVECntElts(IC, II, 4);
2068 case Intrinsic::aarch64_sve_cnth:
2069 return instCombineSVECntElts(IC, II, 8);
2070 case Intrinsic::aarch64_sve_cntb:
2071 return instCombineSVECntElts(IC, II, 16);
2072 case Intrinsic::aarch64_sve_ptest_any:
2073 case Intrinsic::aarch64_sve_ptest_first:
2074 case Intrinsic::aarch64_sve_ptest_last:
2075 return instCombineSVEPTest(IC, II);
2076 case Intrinsic::aarch64_sve_fabd:
2077 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2078 case Intrinsic::aarch64_sve_fadd:
2079 return instCombineSVEVectorFAdd(IC, II);
2080 case Intrinsic::aarch64_sve_fadd_u:
2081 return instCombineSVEVectorFAddU(IC, II);
2082 case Intrinsic::aarch64_sve_fdiv:
2083 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2084 case Intrinsic::aarch64_sve_fmax:
2085 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2086 case Intrinsic::aarch64_sve_fmaxnm:
2087 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2088 case Intrinsic::aarch64_sve_fmin:
2089 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2090 case Intrinsic::aarch64_sve_fminnm:
2091 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2092 case Intrinsic::aarch64_sve_fmla:
2093 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2094 case Intrinsic::aarch64_sve_fmls:
2095 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2096 case Intrinsic::aarch64_sve_fmul:
2097 if (auto II_U =
2098 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2099 return II_U;
2100 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2101 case Intrinsic::aarch64_sve_fmul_u:
2102 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2103 case Intrinsic::aarch64_sve_fmulx:
2104 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2105 case Intrinsic::aarch64_sve_fnmla:
2106 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2107 case Intrinsic::aarch64_sve_fnmls:
2108 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2109 case Intrinsic::aarch64_sve_fsub:
2110 return instCombineSVEVectorFSub(IC, II);
2111 case Intrinsic::aarch64_sve_fsub_u:
2112 return instCombineSVEVectorFSubU(IC, II);
2113 case Intrinsic::aarch64_sve_add:
2114 return instCombineSVEVectorAdd(IC, II);
2115 case Intrinsic::aarch64_sve_add_u:
2116 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2117 Intrinsic::aarch64_sve_mla_u>(
2118 IC, II, true);
2119 case Intrinsic::aarch64_sve_mla:
2120 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2121 case Intrinsic::aarch64_sve_mls:
2122 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2123 case Intrinsic::aarch64_sve_mul:
2124 if (auto II_U =
2125 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2126 return II_U;
2127 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2128 case Intrinsic::aarch64_sve_mul_u:
2129 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2130 case Intrinsic::aarch64_sve_sabd:
2131 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2132 case Intrinsic::aarch64_sve_smax:
2133 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2134 case Intrinsic::aarch64_sve_smin:
2135 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2136 case Intrinsic::aarch64_sve_smulh:
2137 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2138 case Intrinsic::aarch64_sve_sub:
2139 return instCombineSVEVectorSub(IC, II);
2140 case Intrinsic::aarch64_sve_sub_u:
2141 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2142 Intrinsic::aarch64_sve_mls_u>(
2143 IC, II, true);
2144 case Intrinsic::aarch64_sve_uabd:
2145 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2146 case Intrinsic::aarch64_sve_umax:
2147 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2148 case Intrinsic::aarch64_sve_umin:
2149 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2150 case Intrinsic::aarch64_sve_umulh:
2151 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2152 case Intrinsic::aarch64_sve_asr:
2153 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2154 case Intrinsic::aarch64_sve_lsl:
2155 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2156 case Intrinsic::aarch64_sve_lsr:
2157 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2158 case Intrinsic::aarch64_sve_and:
2159 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2160 case Intrinsic::aarch64_sve_bic:
2161 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2162 case Intrinsic::aarch64_sve_eor:
2163 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2164 case Intrinsic::aarch64_sve_orr:
2165 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2166 case Intrinsic::aarch64_sve_sqsub:
2167 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2168 case Intrinsic::aarch64_sve_uqsub:
2169 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2170 case Intrinsic::aarch64_sve_tbl:
2171 return instCombineSVETBL(IC, II);
2172 case Intrinsic::aarch64_sve_uunpkhi:
2173 case Intrinsic::aarch64_sve_uunpklo:
2174 case Intrinsic::aarch64_sve_sunpkhi:
2175 case Intrinsic::aarch64_sve_sunpklo:
2176 return instCombineSVEUnpack(IC, II);
2177 case Intrinsic::aarch64_sve_uzp1:
2178 return instCombineSVEUzp1(IC, II);
2179 case Intrinsic::aarch64_sve_zip1:
2180 case Intrinsic::aarch64_sve_zip2:
2181 return instCombineSVEZip(IC, II);
2182 case Intrinsic::aarch64_sve_ld1_gather_index:
2183 return instCombineLD1GatherIndex(IC, II);
2184 case Intrinsic::aarch64_sve_st1_scatter_index:
2185 return instCombineST1ScatterIndex(IC, II);
2186 case Intrinsic::aarch64_sve_ld1:
2187 return instCombineSVELD1(IC, II, DL);
2188 case Intrinsic::aarch64_sve_st1:
2189 return instCombineSVEST1(IC, II, DL);
2190 case Intrinsic::aarch64_sve_sdiv:
2191 return instCombineSVESDIV(IC, II);
2192 case Intrinsic::aarch64_sve_sel:
2193 return instCombineSVESel(IC, II);
2194 case Intrinsic::aarch64_sve_srshl:
2195 return instCombineSVESrshl(IC, II);
2196 case Intrinsic::aarch64_sve_dupq_lane:
2197 return instCombineSVEDupqLane(IC, II);
2198 }
2199
2200 return std::nullopt;
2201}
2202
2203 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2204 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2205 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2206 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2207 SimplifyAndSetOp) const {
2208 switch (II.getIntrinsicID()) {
2209 default:
2210 break;
2211 case Intrinsic::aarch64_neon_fcvtxn:
2212 case Intrinsic::aarch64_neon_rshrn:
2213 case Intrinsic::aarch64_neon_sqrshrn:
2214 case Intrinsic::aarch64_neon_sqrshrun:
2215 case Intrinsic::aarch64_neon_sqshrn:
2216 case Intrinsic::aarch64_neon_sqshrun:
2217 case Intrinsic::aarch64_neon_sqxtn:
2218 case Intrinsic::aarch64_neon_sqxtun:
2219 case Intrinsic::aarch64_neon_uqrshrn:
2220 case Intrinsic::aarch64_neon_uqshrn:
2221 case Intrinsic::aarch64_neon_uqxtn:
2222 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2223 break;
2224 }
2225
2226 return std::nullopt;
2227}
2228
2229 TypeSize
2230 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2231 switch (K) {
2232 case TargetTransformInfo::RGK_Scalar:
2233 return TypeSize::getFixed(64);
2234 case TargetTransformInfo::RGK_FixedWidthVector:
2235 if (ST->useSVEForFixedLengthVectors() &&
2236 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2237 return TypeSize::getFixed(
2238 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2239 else if (ST->isNeonAvailable())
2240 return TypeSize::getFixed(128);
2241 else
2242 return TypeSize::getFixed(0);
2243 case TargetTransformInfo::RGK_ScalableVector:
2244 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2245 EnableScalableAutovecInStreamingMode))
2246 return TypeSize::getScalable(128);
2247 else
2248 return TypeSize::getScalable(0);
2249 }
2250 llvm_unreachable("Unsupported register kind");
2251}
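// For illustration: on a subtarget whose minimum SVE vector length is 512
// bits, RGK_FixedWidthVector reports std::max(512, 128) = 512 fixed bits,
// while RGK_ScalableVector reports a scalable 128 bits (one 128-bit granule
// per vscale) whenever SVE or streaming SVE can be used.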
2252
2253bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2254 ArrayRef<const Value *> Args,
2255 Type *SrcOverrideTy) {
2256 // A helper that returns a vector type with the scalar type of the given
2257 // type and the same element count as DstTy.
2258 auto toVectorTy = [&](Type *ArgTy) {
2259 return VectorType::get(ArgTy->getScalarType(),
2260 cast<VectorType>(DstTy)->getElementCount());
2261 };
2262
2263 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2264 // i32, i64]. SVE doesn't generally have the same set of instructions to
2265 // perform an extend with the add/sub/mul. There are SMULLB style
2266 // instructions, but they operate on top/bottom, requiring some sort of lane
2267 // interleaving to be used with zext/sext.
2268 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2269 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2270 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2271 return false;
2272
2273 // Determine if the operation has a widening variant. We consider both the
2274 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2275 // instructions.
2276 //
2277 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2278 // verify that their extending operands are eliminated during code
2279 // generation.
2280 Type *SrcTy = SrcOverrideTy;
2281 switch (Opcode) {
2282 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2283 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2284 // The second operand needs to be an extend
2285 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2286 if (!SrcTy)
2287 SrcTy =
2288 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2289 } else
2290 return false;
2291 break;
2292 case Instruction::Mul: { // SMULL(2), UMULL(2)
2293 // Both operands need to be extends of the same type.
2294 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2295 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2296 if (!SrcTy)
2297 SrcTy =
2298 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2299 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2300 // If one of the operands is a Zext and the other has enough zero bits to
2301 // be treated as unsigned, we can still generate a umull, meaning the zext
2302 // is free.
2303 KnownBits Known =
2304 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2305 if (Args[0]->getType()->getScalarSizeInBits() -
2306 Known.Zero.countLeadingOnes() >
2307 DstTy->getScalarSizeInBits() / 2)
2308 return false;
2309 if (!SrcTy)
2310 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2311 DstTy->getScalarSizeInBits() / 2));
2312 } else
2313 return false;
2314 break;
2315 }
2316 default:
2317 return false;
2318 }
2319
2320 // Legalize the destination type and ensure it can be used in a widening
2321 // operation.
2322 auto DstTyL = getTypeLegalizationCost(DstTy);
2323 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2324 return false;
2325
2326 // Legalize the source type and ensure it can be used in a widening
2327 // operation.
2328 assert(SrcTy && "Expected some SrcTy");
2329 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2330 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2331 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2332 return false;
2333
2334 // Get the total number of vector elements in the legalized types.
2335 InstructionCost NumDstEls =
2336 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2337 InstructionCost NumSrcEls =
2338 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2339
2340 // Return true if the legalized types have the same number of vector elements
2341 // and the destination element type size is twice that of the source type.
2342 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2343}
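// For illustration: an add whose second operand is (zext <8 x i8> to
// <8 x i16>) can be lowered with uaddw/uaddl, so it is treated as widening:
// the legalized source and destination have the same number of elements and
// the 16-bit destination elements are exactly twice the 8-bit source width.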
2344
2345// s/urhadd instructions implement the following pattern, making the
2346// extends free:
2347// %x = add ((zext i8 -> i16), 1)
2348// %y = (zext i8 -> i16)
2349// trunc i16 (lshr (add %x, %y), 1) -> i8
2350//
2351 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2352 Type *Src) {
2353 // The source should be a legal vector type.
2354 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2355 (Src->isScalableTy() && !ST->hasSVE2()))
2356 return false;
2357
2358 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2359 return false;
2360
2361 // Look for trunc/lshr/add before trying to match the pattern.
2362 const Instruction *Add = ExtUser;
2363 auto *AddUser =
2364 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2365 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2366 Add = AddUser;
2367
2368 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2369 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2370 return false;
2371
2372 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2373 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2374 Src->getScalarSizeInBits() !=
2375 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2376 return false;
2377
2378 // Try to match the whole pattern. Ext could be either the first or second
2379 // m_ZExtOrSExt matched.
2380 Instruction *Ex1, *Ex2;
2381 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2382 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2383 return false;
2384
2385 // Ensure both extends are of the same type
2386 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2387 Ex1->getOpcode() == Ex2->getOpcode())
2388 return true;
2389
2390 return false;
2391}
2392
2393 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2394 Type *Src,
2395 TTI::CastContextHint CCH,
2396 TTI::TargetCostKind CostKind,
2397 const Instruction *I) {
2398 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2399 assert(ISD && "Invalid opcode");
2400 // If the cast is observable, and it is used by a widening instruction (e.g.,
2401 // uaddl, saddw, etc.), it may be free.
2402 if (I && I->hasOneUser()) {
2403 auto *SingleUser = cast<Instruction>(*I->user_begin());
2404 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2405 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2406 // For adds only count the second operand as free if both operands are
2407 // extends but not the same operation (i.e. both operands are not free in
2408 // add(sext, zext)).
2409 if (SingleUser->getOpcode() == Instruction::Add) {
2410 if (I == SingleUser->getOperand(1) ||
2411 (isa<CastInst>(SingleUser->getOperand(1)) &&
2412 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2413 return 0;
2414 } else // Others are free so long as isWideningInstruction returned true.
2415 return 0;
2416 }
2417
2418 // The cast will be free for the s/urhadd instructions
2419 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2420 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2421 return 0;
2422 }
2423
2424 // TODO: Allow non-throughput costs that aren't binary.
2425 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2426 if (CostKind != TTI::TCK_RecipThroughput)
2427 return Cost == 0 ? 0 : 1;
2428 return Cost;
2429 };
2430
2431 EVT SrcTy = TLI->getValueType(DL, Src);
2432 EVT DstTy = TLI->getValueType(DL, Dst);
2433
2434 if (!SrcTy.isSimple() || !DstTy.isSimple())
2435 return AdjustCost(
2436 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2437
2438 static const TypeConversionCostTblEntry
2439 ConversionTbl[] = {
2440 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2441 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2442 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2443 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2444 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2445 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2446 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2447 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2448 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2449 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2450 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2451 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2452 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2453 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2454 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2455 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2456 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2457 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2458 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2459 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2460
2461 // Truncations on nxvmiN
2462 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2463 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2464 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2465 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2466 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2467 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2468 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2469 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2470 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2471 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2472 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2473 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2474 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2475 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2476 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2477 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2478
2479 // The number of shll instructions for the extension.
2480 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2481 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2482 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2483 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2484 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2485 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2486 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2487 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2488 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2489 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2490 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2491 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2492 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2493 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2494 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2495 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2496
2497 // LowerVectorINT_TO_FP:
2498 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2499 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2500 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2501 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2502 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2503 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2504
2505 // Complex: to v2f32
2506 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2507 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2508 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2509 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2510 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2511 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2512
2513 // Complex: to v4f32
2514 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2515 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2516 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2517 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2518
2519 // Complex: to v8f32
2520 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2521 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2522 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2523 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2524
2525 // Complex: to v16f32
2526 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2527 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2528
2529 // Complex: to v2f64
2530 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2531 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2532 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2533 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2534 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2535 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2536
2537 // Complex: to v4f64
2538 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2539 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2540
2541 // LowerVectorFP_TO_INT
2542 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2543 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2544 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2545 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2546 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2547 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2548
2549 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2550 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2551 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2552 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2553 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2554 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2555 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2556
2557 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2558 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2559 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2560 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2561 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2562
2563 // Complex, from nxv2f32.
2564 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2565 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2566 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2567 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2568 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2569 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2570 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2571 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2572
2573 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2574 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2575 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2576 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2577 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2578 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2579 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2580
2581 // Complex, from nxv2f64.
2582 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2583 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2584 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2585 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2586 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2587 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2588 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2589 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2590
2591 // Complex, from nxv4f32.
2592 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2593 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2594 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2595 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2596 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2597 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2598 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2599 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2600
2601 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2602 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2603 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2604 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2605 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2606
2607 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2608 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2609 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2610 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2611 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2612 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2613 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2614
2615 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2616 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2617 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2618 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2619 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2620
2621 // Complex, from nxv8f16.
2622 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2623 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2624 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2625 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2626 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2627 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2628 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2629 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2630
2631 // Complex, from nxv4f16.
2632 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2633 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2634 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2635 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2636 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2637 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2638 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2639 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2640
2641 // Complex, from nxv2f16.
2642 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2643 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2644 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2645 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2646 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2647 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2648 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2649 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2650
2651 // Truncate from nxvmf32 to nxvmf16.
2652 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2653 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2654 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2655
2656 // Truncate from nxvmf64 to nxvmf16.
2657 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2658 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2659 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2660
2661 // Truncate from nxvmf64 to nxvmf32.
2662 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2663 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2664 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2665
2666 // Extend from nxvmf16 to nxvmf32.
2667 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2668 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2669 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2670
2671 // Extend from nxvmf16 to nxvmf64.
2672 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2673 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2674 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2675
2676 // Extend from nxvmf32 to nxvmf64.
2677 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2678 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2679 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2680
2681 // Bitcasts from float to integer
2682 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2683 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2684 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2685
2686 // Bitcasts from integer to float
2687 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2688 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2689 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2690
2691 // Add cost for extending to illegal -too wide- scalable vectors.
2692 // zero/sign extend are implemented by multiple unpack operations,
2693 // where each operation has a cost of 1.
2694 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2695 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2696 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2697 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2698 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2699 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2700
2701 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2702 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2703 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2704 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2705 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2706 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2707 };
2708
2709 // We have to estimate the cost of a fixed-length operation performed on
2710 // SVE registers, scaled by the number of SVE registers required to
2711 // represent the fixed-width type.
2712 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2713 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2714 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2715 ST->useSVEForFixedLengthVectors(WiderTy)) {
2716 std::pair<InstructionCost, MVT> LT =
2717 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2718 unsigned NumElements = AArch64::SVEBitsPerBlock /
2719 LT.second.getVectorElementType().getSizeInBits();
2720 return AdjustCost(
2721 LT.first *
2722 getCastInstrCost(
2723 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2724 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2725 CostKind, I));
2726 }
2727
2728 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2729 DstTy.getSimpleVT(),
2730 SrcTy.getSimpleVT()))
2731 return AdjustCost(Entry->Cost);
2732
2733 static const TypeConversionCostTblEntry FP16Tbl[] = {
2734 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2735 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2736 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2737 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2738 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2739 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2740 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2741 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2742 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2743 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2744 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2745 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2746 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2747 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2748 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2749 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2750 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2751 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2752 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2753 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2754 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2755 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2756 };
2757
2758 if (ST->hasFullFP16())
2759 if (const auto *Entry = ConvertCostTableLookup(
2760 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2761 return AdjustCost(Entry->Cost);
2762
2763 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2764 CCH == TTI::CastContextHint::Masked &&
2765 ST->isSVEorStreamingSVEAvailable() &&
2766 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2767 TargetLowering::TypePromoteInteger &&
2768 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2769 TargetLowering::TypeSplitVector) {
2770 // The standard behaviour in the backend for these cases is to split the
2771 // extend up into two parts:
2772 // 1. Perform an extending load or masked load up to the legal type.
2773 // 2. Extend the loaded data to the final type.
2774 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2775 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2776 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2777 Opcode, LegalTy, Src, CCH, CostKind, I);
2778 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2779 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2780 return Part1 + Part2;
2781 }
2782
2783 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2784 // but we also want to include the TTI::CastContextHint::Masked case too.
2785 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2786 CCH == TTI::CastContextHint::Masked &&
2787 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
2788 CCH = TTI::CastContextHint::Normal;
2789
2790 return AdjustCost(
2791 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2792}
2793
2794 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2795 Type *Dst,
2796 VectorType *VecTy,
2797 unsigned Index) {
2798
2799 // Make sure we were given a valid extend opcode.
2800 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2801 "Invalid opcode");
2802
2803 // We are extending an element we extract from a vector, so the source type
2804 // of the extend is the element type of the vector.
2805 auto *Src = VecTy->getElementType();
2806
2807 // Sign- and zero-extends are for integer types only.
2808 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2809
2810 // Get the cost for the extract. We compute the cost (if any) for the extend
2811 // below.
2812 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2813 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2814 CostKind, Index, nullptr, nullptr);
2815
2816 // Legalize the types.
2817 auto VecLT = getTypeLegalizationCost(VecTy);
2818 auto DstVT = TLI->getValueType(DL, Dst);
2819 auto SrcVT = TLI->getValueType(DL, Src);
2820
2821 // If the resulting type is still a vector and the destination type is legal,
2822 // we may get the extension for free. If not, get the default cost for the
2823 // extend.
2824 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2825 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2826 CostKind);
2827
2828 // The destination type should be larger than the element type. If not, get
2829 // the default cost for the extend.
2830 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2831 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2832 CostKind);
2833
2834 switch (Opcode) {
2835 default:
2836 llvm_unreachable("Opcode should be either SExt or ZExt");
2837
2838 // For sign-extends, we only need a smov, which performs the extension
2839 // automatically.
2840 case Instruction::SExt:
2841 return Cost;
2842
2843 // For zero-extends, the extend is performed automatically by a umov unless
2844 // the destination type is i64 and the element type is i8 or i16.
2845 case Instruction::ZExt:
2846 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2847 return Cost;
2848 }
2849
2850 // If we are unable to perform the extend for free, get the default cost.
2851 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2852 CostKind);
2853}
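// For illustration: sext(extractelement <4 x i32> %v, 1) to i64 costs only
// the extract, because SMOV sign-extends as it moves the lane to a GPR. A
// zext of an i8 lane to i64 still pays for the extend, since UMOV only
// zero-extends up to 32 bits.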
2854
2855 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2856 TTI::TargetCostKind CostKind,
2857 const Instruction *I) {
2858 if (CostKind != TTI::TCK_RecipThroughput)
2859 return Opcode == Instruction::PHI ? 0 : 1;
2860 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2861 // Branches are assumed to be predicted.
2862 return 0;
2863}
2864
2865InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2866 Type *Val,
2867 unsigned Index,
2868 bool HasRealUse) {
2869 assert(Val->isVectorTy() && "This must be a vector type");
2870
2871 if (Index != -1U) {
2872 // Legalize the type.
2873 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2874
2875 // This type is legalized to a scalar type.
2876 if (!LT.second.isVector())
2877 return 0;
2878
2879 // The type may be split. For fixed-width vectors we can normalize the
2880 // index to the new type.
2881 if (LT.second.isFixedLengthVector()) {
2882 unsigned Width = LT.second.getVectorNumElements();
2883 Index = Index % Width;
2884 }
2885
2886 // The element at index zero is already inside the vector.
2887 // - For a physical (HasRealUse==true) insert-element or extract-element
2888 // instruction that extracts integers, an explicit FPR -> GPR move is
2889 // needed. So it has non-zero cost.
2890 // - For the rest of cases (virtual instruction or element type is float),
2891 // consider the instruction free.
2892 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2893 return 0;
2894
2895 // This is recognising an LD1 (load one single-element structure to one
2896 // lane of one register) instruction. I.e., if this is an `insertelement`
2897 // instruction whose second operand is a load, then we will generate an
2898 // LD1, which is an expensive instruction.
2899 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2900 return ST->getVectorInsertExtractBaseCost() + 1;
2901
2902 // i1 inserts and extracts will include an extra cset or cmp of the vector
2903 // value. Increase the cost by 1 to account for this.
2904 if (Val->getScalarSizeInBits() == 1)
2905 return ST->getVectorInsertExtractBaseCost() + 1;
2906
2907 // FIXME:
2908 // If the extract-element and insert-element instructions could be
2909 // simplified away (e.g., could be combined into users by looking at use-def
2910 // context), they have no cost. This is not done in the first place for
2911 // compile-time considerations.
2912 }
2913
2914 // All other insert/extracts cost this much.
2915 return ST->getVectorInsertExtractBaseCost();
2916}
2917
2918 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2919 TTI::TargetCostKind CostKind,
2920 unsigned Index, Value *Op0,
2921 Value *Op1) {
2922 bool HasRealUse =
2923 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2924 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2925}
2926
2927 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2928 Type *Val,
2929 TTI::TargetCostKind CostKind,
2930 unsigned Index) {
2931 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2932}
2933
2934 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
2935 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2936 TTI::TargetCostKind CostKind) {
2937 if (isa<ScalableVectorType>(Ty))
2938 return InstructionCost::getInvalid();
2939 if (Ty->getElementType()->isFloatingPointTy())
2940 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
2941 CostKind);
2942 return DemandedElts.popcount() * (Insert + Extract) *
2943 ST->getVectorInsertExtractBaseCost();
2944}
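// For illustration: scalarizing all four lanes of a <4 x i32> for insertion
// only (Insert = true, Extract = false) costs 4 * 1 * the subtarget's base
// insert/extract cost; demanding both inserts and extracts doubles that.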
2945
2946 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2947 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2948 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2949 ArrayRef<const Value *> Args,
2950 const Instruction *CxtI) {
2951
2952 // TODO: Handle more cost kinds.
2953 if (CostKind != TTI::TCK_RecipThroughput)
2954 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2955 Op2Info, Args, CxtI);
2956
2957 // Legalize the type.
2958 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2959 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2960
2961 switch (ISD) {
2962 default:
2963 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2964 Op2Info);
2965 case ISD::SDIV:
2966 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2967 // On AArch64, scalar signed division by constants power-of-two are
2968 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2969 // The OperandValue properties may not be the same as those of the previous
2970 // operation; conservatively assume OP_None.
2971 InstructionCost Cost = getArithmeticInstrCost(
2972 Instruction::Add, Ty, CostKind,
2973 Op1Info.getNoProps(), Op2Info.getNoProps());
2974 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2975 Op1Info.getNoProps(), Op2Info.getNoProps());
2976 Cost += getArithmeticInstrCost(
2977 Instruction::Select, Ty, CostKind,
2978 Op1Info.getNoProps(), Op2Info.getNoProps());
2979 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2980 Op1Info.getNoProps(), Op2Info.getNoProps());
2981 return Cost;
2982 }
2983 [[fallthrough]];
2984 case ISD::UDIV: {
2985 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2986 auto VT = TLI->getValueType(DL, Ty);
2987 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2988 // Vector signed division by a constant is expanded to the
2989 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2990 // to MULHU + SUB + SRL + ADD + SRL.
2991 InstructionCost MulCost = getArithmeticInstrCost(
2992 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2993 InstructionCost AddCost = getArithmeticInstrCost(
2994 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2995 InstructionCost ShrCost = getArithmeticInstrCost(
2996 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2997 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2998 }
2999 }
3000
3001 InstructionCost Cost = BaseT::getArithmeticInstrCost(
3002 Opcode, Ty, CostKind, Op1Info, Op2Info);
3003 if (Ty->isVectorTy()) {
3004 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3005 // If SDIV/UDIV operations are lowered using SVE, then we can have
3006 // lower costs.
3007 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3008 ->getPrimitiveSizeInBits()
3009 .getFixedValue() < 128) {
3010 EVT VT = TLI->getValueType(DL, Ty);
3011 static const CostTblEntry DivTbl[]{
3012 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3013 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3014 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3015 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3016 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3017 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3018
3019 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3020 if (nullptr != Entry)
3021 return Entry->Cost;
3022 }
3023 // For 8/16-bit elements, the cost is higher because the type
3024 // requires promotion and possibly splitting:
3025 if (LT.second.getScalarType() == MVT::i8)
3026 Cost *= 8;
3027 else if (LT.second.getScalarType() == MVT::i16)
3028 Cost *= 4;
3029 return Cost;
3030 } else {
3031 // If one of the operands is a uniform constant then the cost for each
3032 // element is Cost for insertion, extraction and division.
3033 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
3034 // operation with scalar type
3035 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3036 (Op2Info.isConstant() && Op2Info.isUniform())) {
3037 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3038 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3039 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3040 return (4 + DivCost) * VTy->getNumElements();
3041 }
3042 }
3043 // On AArch64, without SVE, vector divisions are expanded
3044 // into scalar divisions of each pair of elements.
3045 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3046 CostKind, Op1Info, Op2Info);
3047 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3048 Op1Info, Op2Info);
3049 }
3050
3051 // TODO: if one of the arguments is scalar, then it's not necessary to
3052 // double the cost of handling the vector elements.
3053 Cost += Cost;
3054 }
3055 return Cost;
3056 }
3057 case ISD::MUL:
3058 // When SVE is available, then we can lower the v2i64 operation using
3059 // the SVE mul instruction, which has a lower cost.
3060 if (LT.second == MVT::v2i64 && ST->hasSVE())
3061 return LT.first;
3062
3063 // When SVE is not available, there is no MUL.2d instruction,
3064 // which means mul <2 x i64> is expensive as elements are extracted
3065 // from the vectors and the muls scalarized.
3066 // As getScalarizationOverhead is a bit too pessimistic, we
3067 // estimate the cost for an i64 vector directly here, which is:
3068 // - four 2-cost i64 extracts,
3069 // - two 2-cost i64 inserts, and
3070 // - two 1-cost muls.
3071 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3072 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3073 // need to scalarize so the cost can be cheaper (smull or umull).
3074 // so the cost can be cheaper (smull or umull).
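// Working through the estimate above for a single v2i64 (LT.first = 1), using
// the per-operation costs stated in the comment:
//   4 extracts * 2 + 2 inserts * 2 + 2 muls * 1 = 8 + 4 + 2 = 14
// and a v4i64 legalizes into two v2i64 halves, giving 2 * 14 = 28.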
3075 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3076 return LT.first;
3077 return LT.first * 14;
3078 case ISD::ADD:
3079 case ISD::XOR:
3080 case ISD::OR:
3081 case ISD::AND:
3082 case ISD::SRL:
3083 case ISD::SRA:
3084 case ISD::SHL:
3085 // These nodes are marked as 'custom' for combining purposes only.
3086 // We know that they are legal. See LowerAdd in ISelLowering.
3087 return LT.first;
3088
3089 case ISD::FNEG:
3090 case ISD::FADD:
3091 case ISD::FSUB:
3092 // Increase the cost for half and bfloat types if not architecturally
3093 // supported.
3094 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3095 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3096 return 2 * LT.first;
3097 if (!Ty->getScalarType()->isFP128Ty())
3098 return LT.first;
3099 [[fallthrough]];
3100 case ISD::FMUL:
3101 case ISD::FDIV:
3102 // These nodes are marked as 'custom' just to lower them to SVE.
3103 // We know said lowering will incur no additional cost.
3104 if (!Ty->getScalarType()->isFP128Ty())
3105 return 2 * LT.first;
3106
3107 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3108 Op2Info);
3109 case ISD::FREM:
3110 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3111 // those functions are not declared in the module.
3112 if (!Ty->isVectorTy())
3113 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3114 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3115 Op2Info);
3116 }
3117}
3118
3119InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3120 ScalarEvolution *SE,
3121 const SCEV *Ptr) {
3122 // Address computations in vectorized code with non-consecutive addresses will
3123 // likely result in more instructions compared to scalar code where the
3124 // computation can more often be merged into the index mode. The resulting
3125 // extra micro-ops can significantly decrease throughput.
3126 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3127 int MaxMergeDistance = 64;
3128
3129 if (Ty->isVectorTy() && SE &&
3130 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3131 return NumVectorInstToHideOverhead;
3132
3133 // In many cases the address computation is not merged into the instruction
3134 // addressing mode.
3135 return 1;
3136}
3137
3138InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3139 Type *CondTy,
3140 CmpInst::Predicate VecPred,
3141 TTI::TargetCostKind CostKind,
3142 const Instruction *I) {
3143 // TODO: Handle other cost kinds.
3144 if (CostKind != TTI::TCK_RecipThroughput)
3145 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3146 I);
3147
3148 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3149 // We don't lower some vector selects well that are wider than the register
3150 // width.
3151 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3152 // We would need this many instructions to hide the scalarization happening.
3153 const int AmortizationCost = 20;
3154
3155 // If VecPred is not set, check if we can get a predicate from the context
3156 // instruction, if its type matches the requested ValTy.
3157 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3158 CmpInst::Predicate CurrentPred;
3159 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3160 m_Value())))
3161 VecPred = CurrentPred;
3162 }
3163 // Check if we have a compare/select chain that can be lowered using
3164 // a (F)CMxx & BFI pair.
3165 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3166 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3167 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3168 VecPred == CmpInst::FCMP_UNE) {
3169 static const auto ValidMinMaxTys = {
3170 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3171 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3172 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3173
3174 auto LT = getTypeLegalizationCost(ValTy);
3175 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3176 (ST->hasFullFP16() &&
3177 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3178 return LT.first;
3179 }
3180
3181 static const TypeConversionCostTblEntry
3182 VectorSelectTbl[] = {
3183 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3184 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3185 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3186 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3187 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3188 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3189 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3190 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3191 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3192 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3193 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3194 };
3195
3196 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3197 EVT SelValTy = TLI->getValueType(DL, ValTy);
3198 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3199 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3200 SelCondTy.getSimpleVT(),
3201 SelValTy.getSimpleVT()))
3202 return Entry->Cost;
3203 }
3204 }
3205
3206 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3207 auto LT = getTypeLegalizationCost(ValTy);
3208 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3209 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3210 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3211 }
3212
3213 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3214 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3215 // be profitable.
3216 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3217 ICmpInst::isEquality(VecPred) &&
3218 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3219 match(I->getOperand(1), m_Zero()) &&
3220 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3221 return 0;
3222
3223 // The base case handles scalable vectors fine for now, since it treats the
3224 // cost as 1 * legalization cost.
3225 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3226}
3227
3228TTI::MemCmpExpansionOptions
3229AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3230 TTI::MemCmpExpansionOptions Options;
3231 if (ST->requiresStrictAlign()) {
3232 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3233 // a bunch of instructions when strict align is enabled.
3234 return Options;
3235 }
3236 Options.AllowOverlappingLoads = true;
3237 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3238 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3239 // TODO: Though vector loads usually perform well on AArch64, on some targets
3240 // they may wake up the FP unit, which raises the power consumption. Perhaps
3241 // they could be used with no holds barred (-O3).
3242 Options.LoadSizes = {8, 4, 2, 1};
3243 Options.AllowedTailExpansions = {3, 5, 6};
3244 return Options;
3245}
3246
3247bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3248 return ST->hasSVE();
3249}
3250
3251InstructionCost
3252AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3253 Align Alignment, unsigned AddressSpace,
3254 TTI::TargetCostKind CostKind) {
3255 if (useNeonVector(Src))
3256 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3257 CostKind);
3258 auto LT = getTypeLegalizationCost(Src);
3259 if (!LT.first.isValid())
3260 return InstructionCost::getInvalid();
3261
3262 // Return an invalid cost for element types that we are unable to lower.
3263 auto *VT = cast<VectorType>(Src);
3264 if (VT->getElementType()->isIntegerTy(1))
3265 return InstructionCost::getInvalid();
3266
3267 // The code-generator is currently not able to handle scalable vectors
3268 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3269 // it. This change will be removed when code-generation for these types is
3270 // sufficiently reliable.
3271 if (VT->getElementCount() == ElementCount::getScalable(1))
3272 return InstructionCost::getInvalid();
3273
3274 return LT.first;
3275}
3276
3277static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3278 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3279}
3280
3281InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3282 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3283 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3284 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3285 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3286 Alignment, CostKind, I);
3287 auto *VT = cast<VectorType>(DataTy);
3288 auto LT = getTypeLegalizationCost(DataTy);
3289 if (!LT.first.isValid())
3290 return InstructionCost::getInvalid();
3291
3292 // Return an invalid cost for element types that we are unable to lower.
3293 if (!LT.second.isVector() ||
3294 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3295 VT->getElementType()->isIntegerTy(1))
3296 return InstructionCost::getInvalid();
3297
3298 // The code-generator is currently not able to handle scalable vectors
3299 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3300 // it. This change will be removed when code-generation for these types is
3301 // sufficiently reliable.
3302 if (VT->getElementCount() == ElementCount::getScalable(1))
3303 return InstructionCost::getInvalid();
3304
3305 ElementCount LegalVF = LT.second.getVectorElementCount();
3306 InstructionCost MemOpCost =
3307 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3308 {TTI::OK_AnyValue, TTI::OP_None}, I);
3309 // Add on an overhead cost for using gathers/scatters.
3310 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3311 // point we may want a per-CPU overhead.
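// A rough worked example of the formula returned below, assuming a legalized
// nxv4i32 gather, a maximum SVE vector length of 128 bits (so at most 4
// elements), a scalar load cost of 1, and the default sve-gather-overhead of
// 10: LT.first * (1 * 10) * 4 = 40 per gather.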
3312 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3313 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3314}
3315
3316bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3317 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3318}
3319
3320InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3321 MaybeAlign Alignment,
3322 unsigned AddressSpace,
3323 TTI::TargetCostKind CostKind,
3324 TTI::OperandValueInfo OpInfo,
3325 const Instruction *I) {
3326 EVT VT = TLI->getValueType(DL, Ty, true);
3327 // Type legalization can't handle structs
3328 if (VT == MVT::Other)
3329 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3330 CostKind);
3331
3332 auto LT = getTypeLegalizationCost(Ty);
3333 if (!LT.first.isValid())
3334 return InstructionCost::getInvalid();
3335
3336 // The code-generator is currently not able to handle scalable vectors
3337 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3338 // it. This change will be removed when code-generation for these types is
3339 // sufficiently reliable.
3340 // We also only support full register predicate loads and stores.
3341 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3342 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3343 (VTy->getElementType()->isIntegerTy(1) &&
3344 !VTy->getElementCount().isKnownMultipleOf(
3345 ElementCount::getScalable(16))))
3346 return InstructionCost::getInvalid();
3347
3348 // TODO: consider latency as well for TCK_SizeAndLatency.
3349 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3350 return LT.first;
3351
3352 if (CostKind != TTI::TCK_RecipThroughput)
3353 return 1;
3354
3355 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3356 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3357 // Unaligned stores are extremely inefficient. We don't split all
3358 // unaligned 128-bit stores because of the negative impact that has been
3359 // shown in practice on inlined block copy code.
3360 // We make such stores expensive so that we will only vectorize if there
3361 // are 6 other instructions getting vectorized.
3362 const int AmortizationCost = 6;
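// For example, a misaligned 128-bit store such as a hypothetical
// "store <4 x i32> %v, ptr %p, align 4" on such a subtarget is costed at
// roughly LT.first * 2 * 6 = 12, so it is only vectorized when enough other
// work is vectorized alongside it.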
3363
3364 return LT.first * 2 * AmortizationCost;
3365 }
3366
3367 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3368 if (Ty->isPtrOrPtrVectorTy())
3369 return LT.first;
3370
3371 if (useNeonVector(Ty)) {
3372 // Check truncating stores and extending loads.
3373 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3374 // v4i8 types are lowered to scalar a load/store and sshll/xtn.
3375 if (VT == MVT::v4i8)
3376 return 2;
3377 // Otherwise we need to scalarize.
3378 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3379 }
3380 EVT EltVT = VT.getVectorElementType();
3381 unsigned EltSize = EltVT.getScalarSizeInBits();
3382 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3383 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3384 *Alignment != Align(1))
3385 return LT.first;
3386 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3387 // widening to v4i8, which produces suboptimal results.
3388 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3389 return LT.first;
3390
3391 // Check non-power-of-2 loads/stores for legal vector element types with
3392 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3393 // operations on smaller power-of-2 ops, including ld1/st1.
3394 LLVMContext &C = Ty->getContext();
3395 InstructionCost Cost;
3396 SmallVector<EVT> TypeWorklist;
3397 TypeWorklist.push_back(VT);
3398 while (!TypeWorklist.empty()) {
3399 EVT CurrVT = TypeWorklist.pop_back_val();
3400 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3401 if (isPowerOf2_32(CurrNumElements)) {
3402 Cost += 1;
3403 continue;
3404 }
3405
3406 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3407 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3408 TypeWorklist.push_back(
3409 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3410 }
3411 return Cost;
3412 }
3413
3414 return LT.first;
3415}
3416
3417InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3418 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3419 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3420 bool UseMaskForCond, bool UseMaskForGaps) {
3421 assert(Factor >= 2 && "Invalid interleave factor");
3422 auto *VecVTy = cast<VectorType>(VecTy);
3423
3424 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3425 return InstructionCost::getInvalid();
3426
3427 // Vectorization for masked interleaved accesses is only enabled for scalable
3428 // VF.
3429 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3430 return InstructionCost::getInvalid();
3431
3432 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3433 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3434 auto *SubVecTy =
3435 VectorType::get(VecVTy->getElementType(),
3436 VecVTy->getElementCount().divideCoefficientBy(Factor));
3437
3438 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3439 // Accesses having vector types that are a multiple of 128 bits can be
3440 // matched to more than one ldN/stN instruction.
3441 bool UseScalable;
3442 if (MinElts % Factor == 0 &&
3443 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3444 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3445 }
3446
3447 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3448 Alignment, AddressSpace, CostKind,
3449 UseMaskForCond, UseMaskForGaps);
3450}
3451
3452InstructionCost
3453AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3454 InstructionCost Cost = 0;
3455 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3456 for (auto *I : Tys) {
3457 if (!I->isVectorTy())
3458 continue;
3459 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3460 128)
3461 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3462 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3463 }
3464 return Cost;
3465}
3466
3467unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3468 return ST->getMaxInterleaveFactor();
3469}
3470
3471// For Falkor, we want to avoid having too many strided loads in a loop since
3472// that can exhaust the HW prefetcher resources. We adjust the unroller
3473// MaxCount preference below to attempt to ensure unrolling doesn't create too
3474// many strided loads.
3475static void
3476getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3477 TargetTransformInfo::UnrollingPreferences &UP) {
3478 enum { MaxStridedLoads = 7 };
3479 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3480 int StridedLoads = 0;
3481 // FIXME? We could make this more precise by looking at the CFG and
3482 // e.g. not counting loads in each side of an if-then-else diamond.
3483 for (const auto BB : L->blocks()) {
3484 for (auto &I : *BB) {
3485 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3486 if (!LMemI)
3487 continue;
3488
3489 Value *PtrValue = LMemI->getPointerOperand();
3490 if (L->isLoopInvariant(PtrValue))
3491 continue;
3492
3493 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3494 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3495 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3496 continue;
3497
3498 // FIXME? We could take pairing of unrolled load copies into account
3499 // by looking at the AddRec, but we would probably have to limit this
3500 // to loops with no stores or other memory optimization barriers.
3501 ++StridedLoads;
3502 // We've seen enough strided loads that seeing more won't make a
3503 // difference.
3504 if (StridedLoads > MaxStridedLoads / 2)
3505 return StridedLoads;
3506 }
3507 }
3508 return StridedLoads;
3509 };
3510
3511 int StridedLoads = countStridedLoads(L, SE);
3512 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3513 << " strided loads\n");
3514 // Pick the largest power of 2 unroll count that won't result in too many
3515 // strided loads.
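// A rough worked example of the clamp below: with MaxStridedLoads = 7 and,
// say, 2 strided loads detected, MaxCount becomes 1 << Log2_32(7 / 2) =
// 1 << 1 = 2, i.e. unrolling is limited to a factor of 2.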
3516 if (StridedLoads) {
3517 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3518 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3519 << UP.MaxCount << '\n');
3520 }
3521}
3522
3523void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3524 TTI::UnrollingPreferences &UP,
3525 OptimizationRemarkEmitter *ORE) {
3526 // Enable partial unrolling and runtime unrolling.
3527 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3528
3529 UP.UpperBound = true;
3530
3531 // For inner loop, it is more likely to be a hot one, and the runtime check
3532 // can be promoted out from LICM pass, so the overhead is less, let's try
3533 // a larger threshold to unroll more loops.
3534 if (L->getLoopDepth() > 1)
3535 UP.PartialThreshold *= 2;
3536
3537 // Disable partial & runtime unrolling on -Os.
3538 UP.PartialOptSizeThreshold = 0;
3539
3540 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3541 EnableFalkorHWPFUnrollFix)
3542 getFalkorUnrollingPreferences(L, SE, UP);
3543
3544 // Scan the loop: don't unroll loops with calls as this could prevent
3545 // inlining. Don't unroll vector loops either, as they don't benefit much from
3546 // unrolling.
3547 for (auto *BB : L->getBlocks()) {
3548 for (auto &I : *BB) {
3549 // Don't unroll vectorised loop.
3550 if (I.getType()->isVectorTy())
3551 return;
3552
3553 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3554 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3555 if (!isLoweredToCall(F))
3556 continue;
3557 }
3558 return;
3559 }
3560 }
3561 }
3562
3563 // Enable runtime unrolling for in-order models
3564 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3565 // checking for that case, we can ensure that the default behaviour is
3566 // unchanged
3567 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3568 !ST->getSchedModel().isOutOfOrder()) {
3569 UP.Runtime = true;
3570 UP.Partial = true;
3571 UP.UnrollRemainder = true;
3572 UP.DefaultUnrollRuntimeCount = 4;
3573
3574 UP.UnrollAndJam = true;
3575 UP.UnrollAndJamInnerLoopThreshold = 60;
3576 }
3577}
3578
3579void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3580 TTI::PeelingPreferences &PP) {
3581 BaseT::getPeelingPreferences(L, SE, PP);
3582}
3583
3584Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
3585 Type *ExpectedType) {
3586 switch (Inst->getIntrinsicID()) {
3587 default:
3588 return nullptr;
3589 case Intrinsic::aarch64_neon_st2:
3590 case Intrinsic::aarch64_neon_st3:
3591 case Intrinsic::aarch64_neon_st4: {
3592 // Create a struct type
3593 StructType *ST = dyn_cast<StructType>(ExpectedType);
3594 if (!ST)
3595 return nullptr;
3596 unsigned NumElts = Inst->arg_size() - 1;
3597 if (ST->getNumElements() != NumElts)
3598 return nullptr;
3599 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3600 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3601 return nullptr;
3602 }
3603 Value *Res = PoisonValue::get(ExpectedType);
3604 IRBuilder<> Builder(Inst);
3605 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3606 Value *L = Inst->getArgOperand(i);
3607 Res = Builder.CreateInsertValue(Res, L, i);
3608 }
3609 return Res;
3610 }
3611 case Intrinsic::aarch64_neon_ld2:
3612 case Intrinsic::aarch64_neon_ld3:
3613 case Intrinsic::aarch64_neon_ld4:
3614 if (Inst->getType() == ExpectedType)
3615 return Inst;
3616 return nullptr;
3617 }
3618}
3619
3620bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3621 MemIntrinsicInfo &Info) {
3622 switch (Inst->getIntrinsicID()) {
3623 default:
3624 break;
3625 case Intrinsic::aarch64_neon_ld2:
3626 case Intrinsic::aarch64_neon_ld3:
3627 case Intrinsic::aarch64_neon_ld4:
3628 Info.ReadMem = true;
3629 Info.WriteMem = false;
3630 Info.PtrVal = Inst->getArgOperand(0);
3631 break;
3632 case Intrinsic::aarch64_neon_st2:
3633 case Intrinsic::aarch64_neon_st3:
3634 case Intrinsic::aarch64_neon_st4:
3635 Info.ReadMem = false;
3636 Info.WriteMem = true;
3637 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3638 break;
3639 }
3640
3641 switch (Inst->getIntrinsicID()) {
3642 default:
3643 return false;
3644 case Intrinsic::aarch64_neon_ld2:
3645 case Intrinsic::aarch64_neon_st2:
3646 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3647 break;
3648 case Intrinsic::aarch64_neon_ld3:
3649 case Intrinsic::aarch64_neon_st3:
3650 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3651 break;
3652 case Intrinsic::aarch64_neon_ld4:
3653 case Intrinsic::aarch64_neon_st4:
3654 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3655 break;
3656 }
3657 return true;
3658}
3659
3660/// See if \p I should be considered for address type promotion. We check if \p
3661/// I is a sext with the right type and used in memory accesses. If it is used
3662/// in a "complex" getelementptr, we allow it to be promoted without finding other
3663/// sext instructions that sign extended the same initial value. A getelementptr
3664/// is considered as "complex" if it has more than 2 operands.
3665bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3666 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3667 bool Considerable = false;
3668 AllowPromotionWithoutCommonHeader = false;
3669 if (!isa<SExtInst>(&I))
3670 return false;
3671 Type *ConsideredSExtType =
3672 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3673 if (I.getType() != ConsideredSExtType)
3674 return false;
3675 // See if the sext is the one with the right type and used in at least one
3676 // GetElementPtrInst.
3677 for (const User *U : I.users()) {
3678 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3679 Considerable = true;
3680 // A getelementptr is considered as "complex" if it has more than 2
3681 // operands. We will promote a SExt used in such complex GEP as we
3682 // expect some computation to be merged if they are done on 64 bits.
3683 if (GEPInst->getNumOperands() > 2) {
3684 AllowPromotionWithoutCommonHeader = true;
3685 break;
3686 }
3687 }
3688 }
3689 return Considerable;
3690}
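// A hypothetical example of the "complex" GEP case described above:
//   %idx = sext i32 %i to i64
//   %p = getelementptr inbounds [64 x [64 x i32]], ptr %base, i64 0, i64 %idx, i64 %j
// The GEP has more than 2 operands, so the sext is reported as promotable
// without requiring a common header.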
3691
3692bool AArch64TTIImpl::isLegalToVectorizeReduction(
3693 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3694 if (!VF.isScalable())
3695 return true;
3696
3697 Type *Ty = RdxDesc.getRecurrenceType();
3698 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3699 return false;
3700
3701 switch (RdxDesc.getRecurrenceKind()) {
3702 case RecurKind::Add:
3703 case RecurKind::FAdd:
3704 case RecurKind::And:
3705 case RecurKind::Or:
3706 case RecurKind::Xor:
3707 case RecurKind::SMin:
3708 case RecurKind::SMax:
3709 case RecurKind::UMin:
3710 case RecurKind::UMax:
3711 case RecurKind::FMin:
3712 case RecurKind::FMax:
3713 case RecurKind::FMulAdd:
3714 case RecurKind::IAnyOf:
3715 case RecurKind::FAnyOf:
3716 return true;
3717 default:
3718 return false;
3719 }
3720}
3721
3722InstructionCost
3723AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
3724 FastMathFlags FMF,
3725 TTI::TargetCostKind CostKind) {
3726 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3727
3728 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3729 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3730
3731 InstructionCost LegalizationCost = 0;
3732 if (LT.first > 1) {
3733 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3734 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3735 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3736 }
3737
3738 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3739}
3740
3741InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3742 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3743 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3744 InstructionCost LegalizationCost = 0;
3745 if (LT.first > 1) {
3746 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3747 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3748 LegalizationCost *= LT.first - 1;
3749 }
3750
3751 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3752 assert(ISD && "Invalid opcode");
3753 // Add the final reduction cost for the legal horizontal reduction
3754 switch (ISD) {
3755 case ISD::ADD:
3756 case ISD::AND:
3757 case ISD::OR:
3758 case ISD::XOR:
3759 case ISD::FADD:
3760 return LegalizationCost + 2;
3761 default:
3762 return InstructionCost::getInvalid();
3763 }
3764}
3765
3766InstructionCost
3767AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3768 std::optional<FastMathFlags> FMF,
3769 TTI::TargetCostKind CostKind) {
3770 if (TTI::requiresOrderedReduction(FMF)) {
3771 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3772 InstructionCost BaseCost =
3773 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3774 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3775 // end up vectorizing for more computationally intensive loops.
3776 return BaseCost + FixedVTy->getNumElements();
3777 }
3778
3779 if (Opcode != Instruction::FAdd)
3780 return InstructionCost::getInvalid();
3781
3782 auto *VTy = cast<ScalableVectorType>(ValTy);
3783 InstructionCost Cost =
3784 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3785 Cost *= getMaxNumElements(VTy->getElementCount());
3786 return Cost;
3787 }
3788
3789 if (isa<ScalableVectorType>(ValTy))
3790 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3791
3792 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3793 MVT MTy = LT.second;
3794 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3795 assert(ISD && "Invalid opcode");
3796
3797 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3798 // instructions as twice a normal vector add, plus 1 for each legalization
3799 // step (LT.first). This is the only arithmetic vector reduction operation for
3800 // which we have an instruction.
3801 // OR, XOR and AND costs should match the codegen from:
3802 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3803 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3804 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
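// A rough reading of the table below: a v4i32 add reduction maps to a single
// addv and is costed at 2, while a v8i32 reduction that legalizes into two
// v4i32 halves (LT.first = 2) is costed at (2 - 1) + 2 = 3.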
3805 static const CostTblEntry CostTblNoPairwise[]{
3806 {ISD::ADD, MVT::v8i8, 2},
3807 {ISD::ADD, MVT::v16i8, 2},
3808 {ISD::ADD, MVT::v4i16, 2},
3809 {ISD::ADD, MVT::v8i16, 2},
3810 {ISD::ADD, MVT::v4i32, 2},
3811 {ISD::ADD, MVT::v2i64, 2},
3812 {ISD::OR, MVT::v8i8, 15},
3813 {ISD::OR, MVT::v16i8, 17},
3814 {ISD::OR, MVT::v4i16, 7},
3815 {ISD::OR, MVT::v8i16, 9},
3816 {ISD::OR, MVT::v2i32, 3},
3817 {ISD::OR, MVT::v4i32, 5},
3818 {ISD::OR, MVT::v2i64, 3},
3819 {ISD::XOR, MVT::v8i8, 15},
3820 {ISD::XOR, MVT::v16i8, 17},
3821 {ISD::XOR, MVT::v4i16, 7},
3822 {ISD::XOR, MVT::v8i16, 9},
3823 {ISD::XOR, MVT::v2i32, 3},
3824 {ISD::XOR, MVT::v4i32, 5},
3825 {ISD::XOR, MVT::v2i64, 3},
3826 {ISD::AND, MVT::v8i8, 15},
3827 {ISD::AND, MVT::v16i8, 17},
3828 {ISD::AND, MVT::v4i16, 7},
3829 {ISD::AND, MVT::v8i16, 9},
3830 {ISD::AND, MVT::v2i32, 3},
3831 {ISD::AND, MVT::v4i32, 5},
3832 {ISD::AND, MVT::v2i64, 3},
3833 };
3834 switch (ISD) {
3835 default:
3836 break;
3837 case ISD::ADD:
3838 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3839 return (LT.first - 1) + Entry->Cost;
3840 break;
3841 case ISD::XOR:
3842 case ISD::AND:
3843 case ISD::OR:
3844 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3845 if (!Entry)
3846 break;
3847 auto *ValVTy = cast<FixedVectorType>(ValTy);
3848 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3849 isPowerOf2_32(ValVTy->getNumElements())) {
3850 InstructionCost ExtraCost = 0;
3851 if (LT.first != 1) {
3852 // Type needs to be split, so there is an extra cost of LT.first - 1
3853 // arithmetic ops.
3854 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3855 MTy.getVectorNumElements());
3856 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3857 ExtraCost *= LT.first - 1;
3858 }
3859 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3860 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3861 return Cost + ExtraCost;
3862 }
3863 break;
3864 }
3865 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3866}
3867
3868InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3869 static const CostTblEntry ShuffleTbl[] = {
3870 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3871 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3872 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3873 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3874 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3875 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3876 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3877 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3878 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3879 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3880 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3881 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3882 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3883 };
3884
3885 // The code-generator is currently not able to handle scalable vectors
3886 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3887 // it. This change will be removed when code-generation for these types is
3888 // sufficiently reliable.
3889 if (Tp->getElementCount() == ElementCount::getScalable(1))
3890 return InstructionCost::getInvalid();
3891
3892 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3893 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3894 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3895 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3896 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3897 : LT.second;
3898 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3899 InstructionCost LegalizationCost = 0;
3900 if (Index < 0) {
3901 LegalizationCost =
3902 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3903 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
3904 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3905 CmpInst::BAD_ICMP_PREDICATE, CostKind);
3906 }
3907
3908 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
3909 // Cost performed on a promoted type.
3910 if (LT.second.getScalarType() == MVT::i1) {
3911 LegalizationCost +=
3912 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3913 TTI::CastContextHint::None, CostKind) +
3914 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3915 TTI::CastContextHint::None, CostKind);
3916 }
3917 const auto *Entry =
3918 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3919 assert(Entry && "Illegal Type for Splice");
3920 LegalizationCost += Entry->Cost;
3921 return LegalizationCost * LT.first;
3922}
3923
3924InstructionCost AArch64TTIImpl::getShuffleCost(
3925 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
3926 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
3927 ArrayRef<const Value *> Args, const Instruction *CxtI) {
3928 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3929
3930 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3931 // into smaller vectors and sum the cost of each shuffle.
3932 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3933 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3934 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
3935
3936 // Check for LD3/LD4 instructions, which are represented in llvm IR as
3937 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3938 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
3939 // cost than just the load.
3940 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
3943 return std::max<InstructionCost>(1, LT.first / 4);
3944
3945 // Check for ST3/ST4 instructions, which are represented in llvm IR as
3946 // store(interleaving-shuffle). The shuffle cost could potentially be free,
3947 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
3948 // cost than just the store.
3949 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
3950 (ShuffleVectorInst::isInterleaveMask(
3951 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
3952 ShuffleVectorInst::isInterleaveMask(
3953 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
3954 return LT.first;
3955
3956 unsigned TpNumElts = Mask.size();
3957 unsigned LTNumElts = LT.second.getVectorNumElements();
3958 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3959 VectorType *NTp =
3960 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3961 InstructionCost Cost;
3962 for (unsigned N = 0; N < NumVecs; N++) {
3963 SmallVector<int> NMask;
3964 // Split the existing mask into chunks of size LTNumElts. Track the source
3965 // sub-vectors to ensure the result has at most 2 inputs.
3966 unsigned Source1, Source2;
3967 unsigned NumSources = 0;
3968 for (unsigned E = 0; E < LTNumElts; E++) {
3969 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3970 : PoisonMaskElem;
3971 if (MaskElt < 0) {
3972 NMask.push_back(PoisonMaskElem);
3973 continue;
3974 }
3975
3976 // Calculate which source from the input this comes from and whether it
3977 // is new to us.
3978 unsigned Source = MaskElt / LTNumElts;
3979 if (NumSources == 0) {
3980 Source1 = Source;
3981 NumSources = 1;
3982 } else if (NumSources == 1 && Source != Source1) {
3983 Source2 = Source;
3984 NumSources = 2;
3985 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3986 NumSources++;
3987 }
3988
3989 // Add to the new mask. For the NumSources>2 case these are not correct,
3990 // but are only used for the modular lane number.
3991 if (Source == Source1)
3992 NMask.push_back(MaskElt % LTNumElts);
3993 else if (Source == Source2)
3994 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3995 else
3996 NMask.push_back(MaskElt % LTNumElts);
3997 }
3998 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3999 // getShuffleCost. If not then cost it using the worst case.
4000 if (NumSources <= 2)
4001 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4002 : TTI::SK_PermuteTwoSrc,
4003 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4004 else if (any_of(enumerate(NMask), [&](const auto &ME) {
4005 return ME.value() % LTNumElts == ME.index();
4006 }))
4007 Cost += LTNumElts - 1;
4008 else
4009 Cost += LTNumElts;
4010 }
4011 return Cost;
4012 }
4013
4014 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4015 // Treat extractsubvector as single op permutation.
4016 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4017 if (IsExtractSubvector && LT.second.isFixedLengthVector())
4018 Kind = TTI::SK_PermuteSingleSrc;
4019
4020 // Check for broadcast loads, which are supported by the LD1R instruction.
4021 // In terms of code-size, the shuffle vector is free when a load + dup get
4022 // folded into a LD1R. That's what we check and return here. For performance
4023 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4024 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4025 // that we model the load + dup sequence slightly higher because LD1R is a
4026 // high latency instruction.
4027 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4028 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4029 if (IsLoad && LT.second.isVector() &&
4030 isLegalBroadcastLoad(Tp->getElementType(),
4031 LT.second.getVectorElementCount()))
4032 return 0;
4033 }
4034
4035 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4036 // from the perfect shuffle tables.
4037 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4038 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4039 all_of(Mask, [](int E) { return E < 8; }))
4040 return getPerfectShuffleCost(Mask);
4041
4042 // Check for identity masks, which we can treat as free.
4043 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4044 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4045 all_of(enumerate(Mask), [](const auto &M) {
4046 return M.value() < 0 || M.value() == (int)M.index();
4047 }))
4048 return 0;
4049
4050 // Check for other shuffles that are not SK_ kinds but we have native
4051 // instructions for, for example ZIP and UZP.
4052 unsigned Unused;
4053 if (LT.second.isFixedLengthVector() &&
4054 LT.second.getVectorNumElements() == Mask.size() &&
4055 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4056 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4057 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4058 // Check for non-zero lane splats
4059 all_of(drop_begin(Mask),
4060 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4061 return 1;
4062
4063 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4064 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4065 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4066 static const CostTblEntry ShuffleTbl[] = {
4067 // Broadcast shuffle kinds can be performed with 'dup'.
4068 {TTI::SK_Broadcast, MVT::v8i8, 1},
4069 {TTI::SK_Broadcast, MVT::v16i8, 1},
4070 {TTI::SK_Broadcast, MVT::v4i16, 1},
4071 {TTI::SK_Broadcast, MVT::v8i16, 1},
4072 {TTI::SK_Broadcast, MVT::v2i32, 1},
4073 {TTI::SK_Broadcast, MVT::v4i32, 1},
4074 {TTI::SK_Broadcast, MVT::v2i64, 1},
4075 {TTI::SK_Broadcast, MVT::v4f16, 1},
4076 {TTI::SK_Broadcast, MVT::v8f16, 1},
4077 {TTI::SK_Broadcast, MVT::v2f32, 1},
4078 {TTI::SK_Broadcast, MVT::v4f32, 1},
4079 {TTI::SK_Broadcast, MVT::v2f64, 1},
4080 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4081 // 'zip1/zip2' instructions.
4082 {TTI::SK_Transpose, MVT::v8i8, 1},
4083 {TTI::SK_Transpose, MVT::v16i8, 1},
4084 {TTI::SK_Transpose, MVT::v4i16, 1},
4085 {TTI::SK_Transpose, MVT::v8i16, 1},
4086 {TTI::SK_Transpose, MVT::v2i32, 1},
4087 {TTI::SK_Transpose, MVT::v4i32, 1},
4088 {TTI::SK_Transpose, MVT::v2i64, 1},
4089 {TTI::SK_Transpose, MVT::v4f16, 1},
4090 {TTI::SK_Transpose, MVT::v8f16, 1},
4091 {TTI::SK_Transpose, MVT::v2f32, 1},
4092 {TTI::SK_Transpose, MVT::v4f32, 1},
4093 {TTI::SK_Transpose, MVT::v2f64, 1},
4094 // Select shuffle kinds.
4095 // TODO: handle vXi8/vXi16.
4096 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4097 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4098 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4099 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4100 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4101 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4102 // PermuteSingleSrc shuffle kinds.
4103 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4104 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4105 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4106 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4107 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4108 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4109 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4110 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4111 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4112 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4113 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4114 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4115 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4116 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4117 // Reverse can be lowered with `rev`.
4118 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4119 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4120 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4121 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4122 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4123 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4124 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4125 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4126 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4127 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4128 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4129 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4130 // Splice can all be lowered as `ext`.
4131 {TTI::SK_Splice, MVT::v2i32, 1},
4132 {TTI::SK_Splice, MVT::v4i32, 1},
4133 {TTI::SK_Splice, MVT::v2i64, 1},
4134 {TTI::SK_Splice, MVT::v2f32, 1},
4135 {TTI::SK_Splice, MVT::v4f32, 1},
4136 {TTI::SK_Splice, MVT::v2f64, 1},
4137 {TTI::SK_Splice, MVT::v8f16, 1},
4138 {TTI::SK_Splice, MVT::v8bf16, 1},
4139 {TTI::SK_Splice, MVT::v8i16, 1},
4140 {TTI::SK_Splice, MVT::v16i8, 1},
4141 {TTI::SK_Splice, MVT::v4bf16, 1},
4142 {TTI::SK_Splice, MVT::v4f16, 1},
4143 {TTI::SK_Splice, MVT::v4i16, 1},
4144 {TTI::SK_Splice, MVT::v8i8, 1},
4145 // Broadcast shuffle kinds for scalable vectors
4146 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4147 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4148 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4149 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4150 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4151 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4152 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4153 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4154 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4155 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4156 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4157 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4158 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4159 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4160 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4161 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4162 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4163 // Handle the cases for vector.reverse with scalable vectors
4164 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4165 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4166 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4167 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4168 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4169 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4170 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4171 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4172 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4173 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4174 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4175 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4176 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4177 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4178 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4179 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4180 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4181 };
4182 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4183 return LT.first * Entry->Cost;
4184 }
4185
4186 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4187 return getSpliceCost(Tp, Index);
4188
4189 // Inserting a subvector can often be done with either a D, S or H register
4190 // move, so long as the inserted vector is "aligned".
4191 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4192 LT.second.getSizeInBits() <= 128 && SubTp) {
4193 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4194 if (SubLT.second.isVector()) {
4195 int NumElts = LT.second.getVectorNumElements();
4196 int NumSubElts = SubLT.second.getVectorNumElements();
4197 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4198 return SubLT.first;
4199 }
4200 }
4201
4202 // Restore optimal kind.
4203 if (IsExtractSubvector)
4204 Kind = TTI::SK_ExtractSubvector;
4205 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4206 CxtI);
4207}
4208
4209static bool containsDecreasingPointers(Loop *TheLoop,
4210 PredicatedScalarEvolution *PSE) {
4211 const auto &Strides = DenseMap<Value *, const SCEV *>();
4212 for (BasicBlock *BB : TheLoop->blocks()) {
4213 // Scan the instructions in the block and look for addresses that are
4214 // consecutive and decreasing.
4215 for (Instruction &I : *BB) {
4216 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4217 Value *Ptr = getLoadStorePointerOperand(&I);
4218 Type *AccessTy = getLoadStoreType(&I);
4219 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4220 /*ShouldCheckWrap=*/false)
4221 .value_or(0) < 0)
4222 return true;
4223 }
4224 }
4225 }
4226 return false;
4227}
4228
4229bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
4230 if (!ST->hasSVE())
4231 return false;
4232
4233 // We don't currently support vectorisation with interleaving for SVE - with
4234 // such loops we're better off not using tail-folding. This gives us a chance
4235 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4236 if (TFI->IAI->hasGroups())
4237 return false;
4238
4239 TailFoldingOpts Required = TailFoldingOpts::Disabled;
4240 if (TFI->LVL->getReductionVars().size())
4241 Required |= TailFoldingOpts::Reductions;
4242 if (TFI->LVL->getFixedOrderRecurrences().size())
4243 Required |= TailFoldingOpts::Recurrences;
4244
4245 // We call this to discover whether any load/store pointers in the loop have
4246 // negative strides. This will require extra work to reverse the loop
4247 // predicate, which may be expensive.
4248 if (containsDecreasingPointers(TFI->LVL->getLoop(),
4249 TFI->LVL->getPredicatedScalarEvolution()))
4250 Required |= TailFoldingOpts::Reverse;
4251 if (Required == TailFoldingOpts::Disabled)
4252 Required |= TailFoldingOpts::Simple;
4253
4254 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
4255 Required))
4256 return false;
4257
4258 // Don't tail-fold for tight loops where we would be better off interleaving
4259 // with an unpredicated loop.
4260 unsigned NumInsns = 0;
4261 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4262 NumInsns += BB->sizeWithoutDebug();
4263 }
4264
4265 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4266 return NumInsns >= SVETailFoldInsnThreshold;
4267}
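// For example, with the default sve-tail-folding-insn-threshold of 15, a loop
// whose blocks contain only a dozen instructions (including the roughly 4
// induction-variable and branch instructions) is left unpredicated, while
// larger bodies are considered for tail-folding.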
4268
4269InstructionCost
4270AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
4271 StackOffset BaseOffset, bool HasBaseReg,
4272 int64_t Scale, unsigned AddrSpace) const {
4272 int64_t Scale, unsigned AddrSpace) const {
4273 // Scaling factors are not free at all.
4274 // Operands | Rt Latency
4275 // -------------------------------------------
4276 // Rt, [Xn, Xm] | 4
4277 // -------------------------------------------
4278 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4279 // Rt, [Xn, Wm, <extend> #imm] |
4280 TargetLoweringBase::AddrMode AM;
4281 AM.BaseGV = BaseGV;
4282 AM.BaseOffs = BaseOffset.getFixed();
4283 AM.HasBaseReg = HasBaseReg;
4284 AM.Scale = Scale;
4285 AM.ScalableOffset = BaseOffset.getScalable();
4286 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4287 // Scale represents reg2 * scale, thus account for 1 if
4288 // it is not equal to 0 or 1.
4289 return AM.Scale != 0 && AM.Scale != 1;
4290 return -1;
4291}
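// A rough reading of the return values above, assuming typical addressing
// modes: a base-only or base+offset mode (Scale of 0 or 1) costs 0, a legal
// scaled mode such as [x0, x1, lsl #2] costs 1, and an illegal combination
// yields -1 to signal that the addressing mode is not legal for this type.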
4292
4293bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
4294 // For the binary operators (e.g. or) we need to be more careful than
4295 // selects, here we only transform them if they are already at a natural
4296 // break point in the code - the end of a block with an unconditional
4297 // terminator.
4298 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4299 isa<BranchInst>(I->getNextNode()) &&
4300 cast<BranchInst>(I->getNextNode())->isUnconditional())
4301 return true;
4302 return BaseT::shouldTreatInstructionLikeSelect(I);
4303}
4304
4305bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
4306 const TargetTransformInfo::LSRCost &C2) {
4307 // AArch64 specific here is adding the number of instructions to the
4308 // comparison (though not as the first consideration, as some targets do)
4309 // along with changing the priority of the base additions.
4310 // TODO: Maybe a more nuanced tradeoff between instruction count
4311 // and number of registers? To be investigated at a later date.
4312 if (EnableLSRCostOpt)
4313 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4314 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4315 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4316 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4317
4318 return BaseT::isLSRCostLess(C1, C2);
4319}
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
amdgpu AMDGPU Register Bank Select
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
This file defines the LoopVectorizationLegality class.
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
static uint64_t getBits(uint64_t Val, int Start, int End)
static unsigned getFastMathFlags(const MachineInstr &I)
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimated cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:77
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:428
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1628
unsigned countLeadingOnes() const
Definition: APInt.h:1582
void negate()
Negate this APInt in place.
Definition: APInt.h:1429
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1718
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:806
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:419
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
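Several of the APInt queries listed above (isPowerOf2, logBase2, popcount, sextOrTrunc, getSExtValue) are the kind of checks a cost model applies to immediate operands. A small, self-contained sketch of their behaviour:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  APInt V(32, 64);                        // 32-bit value 64
  bool Pow2 = V.isPowerOf2();             // true: 64 is a power of two > 0
  unsigned Log = V.logBase2();            // 6
  unsigned Ones = V.popcount();           // 1: exactly one bit set

  APInt Neg(32, -8, /*isSigned=*/true);
  bool NegPow2 = Neg.isNegatedPowerOf2(); // true: -(-8) == 8 is a power of two

  APInt Wide = Neg.sextOrTrunc(64);       // sign-extend -8 to 64 bits
  int64_t SVal = Wide.getSExtValue();     // -8

  outs() << unsigned(Pow2) << ' ' << Log << ' ' << Ones << ' '
         << unsigned(NegPow2) << ' ' << SVal << '\n';
  return 0;
}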
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:588
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:975
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:768
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:660
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:897
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:861
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:246
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:760
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:762
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:764
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:773
bool isIntPredicate() const
Definition: InstrTypes.h:865
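The FCMP_* enumerators above are members of CmpInst::Predicate; the class also provides static queries over predicates. A minimal sketch (values shown in comments follow from the predicate definitions above):

#include "llvm/IR/InstrTypes.h"

using namespace llvm;

int main() {
  CmpInst::Predicate P = CmpInst::FCMP_OLT;   // ordered less-than
  bool IsInt = CmpInst::isIntPredicate(P);    // false: it is an FP predicate
  bool IsFP = CmpInst::isFPPredicate(P);      // true
  // Inverting "ordered less-than" yields "unordered greater-or-equal".
  CmpInst::Predicate Inv = CmpInst::getInversePredicate(P);
  return (!IsInt && IsFP && Inv == CmpInst::FCMP_UGE) ? 0 : 1;
}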
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1650
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1357
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
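ElementCount is how fixed-width NEON vectors are distinguished from scalable SVE vectors in the hooks above. A minimal sketch of the factory functions and queries:

#include "llvm/Support/TypeSize.h"

using namespace llvm;

int main() {
  ElementCount FixedEC = ElementCount::getFixed(4);    // e.g. <4 x i32>
  ElementCount ScalEC = ElementCount::getScalable(4);  // e.g. <vscale x 4 x i32>

  bool FixedIsScalable = FixedEC.isScalable();         // false
  bool ScalIsScalable = ScalEC.isScalable();           // true
  unsigned MinFixed = FixedEC.getKnownMinValue();      // exactly 4 elements
  unsigned MinScal = ScalEC.getKnownMinValue();        // 4, multiplied by vscale at runtime
  return (!FixedIsScalable && ScalIsScalable && MinFixed == 4 && MinScal == 4) ? 0 : 1;
}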
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:914
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:88
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2470
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1043
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2458
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:537
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:557
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains NumElts copies of the scalar value V.
Definition: IRBuilder.cpp:1192
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1090
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:524
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:542
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:309
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:529
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1864
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:489
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2203
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2395
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2125
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1788
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2492
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1801
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:552
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1664
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2194
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:178
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
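The IRBuilder methods above are the API used to materialise replacement IR. A self-contained sketch that splats a scalar argument, adds a constant vector, and extracts lane 0; the module and function names are hypothetical:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);

  // define i32 @splat_add(i32 %x)
  auto *FnTy = FunctionType::get(Type::getInt32Ty(Ctx), {Type::getInt32Ty(Ctx)}, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "splat_add", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);

  IRBuilder<> B(BB);                                  // insert at the end of "entry"
  Value *X = F->getArg(0);
  Value *Splat = B.CreateVectorSplat(4, X, "splat");  // <4 x i32> with 4 copies of %x
  Value *Ones = ConstantInt::get(FixedVectorType::get(B.getInt32Ty(), 4), 1);
  Value *Sum = B.CreateBinOp(Instruction::Add, Splat, Ones, "sum");
  Value *Elt = B.CreateExtractElement(Sum, B.getInt64(0), "elt");
  B.CreateRet(Elt);

  verifyFunction(*F, &errs());
  M.print(outs(), nullptr);
  return 0;
}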
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:47
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:386
BuilderTy & Builder
Definition: InstCombiner.h:60
static InstructionCost getInvalid(CostType Val=0)
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:659
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:173
Value * getPointerOperand()
Definition: Instructions.h:252
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1814
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:71
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:627
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
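StackOffset keeps a fixed byte component and a vscale-scaled component side by side, which is how scalable (SVE) offsets are carried through interfaces such as getScalingFactorCost above. A minimal sketch:

#include "llvm/Support/TypeSize.h"

using namespace llvm;

int main() {
  // 16 fixed bytes plus 32 bytes that are multiplied by vscale at runtime.
  StackOffset Off = StackOffset::getFixed(16) + StackOffset::getScalable(32);
  int64_t FixedPart = Off.getFixed();        // 16
  int64_t ScalablePart = Off.getScalable();  // 32
  return (FixedPart == 16 && ScalablePart == 32) ? 0 : 1;
}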
An instruction for storing to memory.
Definition: Instructions.h:289
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:693
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
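StringRef::split and StringSwitch are generic string utilities; a parser for an option string of the form described earlier might use them roughly as follows (the flag names and bit values here are hypothetical):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

using namespace llvm;

int main() {
  // Split "simple+reductions" around the first '+'.
  StringRef Opt = "simple+reductions";
  auto [Head, Tail] = Opt.split('+');   // Head == "simple", Tail == "reductions"

  // Map a flag name to a small integer, with a fallback for unknown names.
  int Bit = StringSwitch<int>(Tail)
                .Case("reductions", 1)
                .Case("recurrences", 2)
                .Case("reverse", 4)
                .Default(0);
  return (Head == "simple" && Bit == 1) ? 0 : 1;
}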
Class to represent struct types.
Definition: DerivedTypes.h:216
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
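The optional FastMathFlags passed to the reduction-cost hooks feeds this helper. A minimal sketch of its behaviour as I read the API: only an FP reduction without reassociation requires ordered evaluation.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/FMF.h"
#include <optional>

using namespace llvm;

int main() {
  FastMathFlags Strict;                  // no fast-math flags set
  bool OrderedStrict =
      TargetTransformInfo::requiresOrderedReduction(Strict);        // true

  FastMathFlags Relaxed;
  Relaxed.setAllowReassoc();             // reassociation permitted
  bool OrderedRelaxed =
      TargetTransformInfo::requiresOrderedReduction(Relaxed);       // false

  bool OrderedInt =
      TargetTransformInfo::requiresOrderedReduction(std::nullopt);  // false: integer reduction
  return (OrderedStrict && !OrderedRelaxed && !OrderedInt) ? 0 : 1;
}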
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:926
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:972
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:921
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
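The PatternMatch entries above (m_Value, m_One, m_c_Add, and friends) are the matcher combinators used to recognise IR shapes. A self-contained sketch that builds a small add and matches it; the module, function, and value names are hypothetical:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  IRBuilder<> B(Ctx);

  auto *FT = FunctionType::get(B.getInt32Ty(), {B.getInt32Ty()}, false);
  Function *F = Function::Create(FT, Function::ExternalLinkage, "f", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  B.SetInsertPoint(BB);

  // %add = add i32 %x, 1
  Value *Add = B.CreateAdd(F->getArg(0), B.getInt32(1), "add");
  B.CreateRet(Add);

  // Match "add %x, 1" with the operands in either order and capture %x.
  Value *X = nullptr;
  bool Matched = match(Add, m_c_Add(m_Value(X), m_One()));  // true, X == %x

  // Constants can be matched directly as well.
  bool IsZero = match(B.getInt32(0), m_ZeroInt());          // true
  return (Matched && X == F->getArg(0) && IsZero) ? 0 : 1;
}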
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
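CostTableLookup is the lookup helper behind per-opcode cost tables. A minimal sketch with a hypothetical two-entry table; the entries and costs below are illustrative, not any target's real numbers:

#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"

using namespace llvm;

// Hypothetical cost table keyed on (ISD opcode, MVT).
static const CostTblEntry DemoTbl[] = {
    {ISD::ADD, MVT::v4i32, 1},
    {ISD::MUL, MVT::v4i32, 4},
};

int main() {
  if (const auto *Entry = CostTableLookup(DemoTbl, ISD::MUL, MVT::v4i32))
    return Entry->Cost == 4 ? 0 : 1;  // found: cost 4
  return 1;                           // not found
}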
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
InstructionCost Cost
@ Default
The result values are uniform if and only if all operands are uniform.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:360
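isPowerOf2_64, Log2_32, and NextPowerOf2 above are the plain integer helpers (as opposed to the APInt members) used for quick legality and sizing checks. A minimal sketch:

#include "llvm/Support/MathExtras.h"

using namespace llvm;

int main() {
  bool P = isPowerOf2_64(64);     // true
  bool Q = isPowerOf2_32(12);     // false
  unsigned L = Log2_32(32);       // 5 (floor log2)
  uint64_t N2 = NextPowerOf2(5);  // 8: strictly greater than 5
  return (P && !Q && L == 5 && N2 == 8) ? 0 : 1;
}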
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55